MiniCPM-Embedding

by openbmb
Embedding Model
2 languages
40.1K downloads · 248
OTHER · Fair · Community-tested
Edge AI: Mobile, Laptop, Server
Quick Summary

MiniCPM-Embedding is a Chinese-English bilingual text embedding model jointly developed by ModelBest (面壁智能), the Natural Language Processing Lab of Tsinghua University (THUNLP), and the Information Retrieval Group of Northeastern University (NEUIR). Its key features are: - Strong Chinese and English retrieval performance. - Strong Chinese-English cross-lingual retrieval performance. MiniCPM-Embedding is built on MiniCPM-2B-sft-bf16...

Code Examples

Usage

Query-side prompt template:

Instruction: {{ instruction }} Query: {{ query }}
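
The template above is the query-side prompt: a task instruction is prepended to the query before encoding. Below is a minimal, unofficial sketch of embedding such a query and scoring it against passages with sentence-transformers. The model ID, the example instruction text, the trust_remote_code flag, and the assumption that passages are encoded without an instruction prefix are illustrative; check the model page for the official usage code.

```python
from sentence_transformers import SentenceTransformer

# Hypothetical setup: loading details may differ from the official example.
model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True)

# Query-side prompt: task instruction + query, per the template above.
instruction = "Given a web search query, retrieve relevant passages that answer the query."
query = f"Instruction: {instruction} Query: what is MiniCPM-Embedding?"

# Assumption: passages are encoded as plain text, without an instruction prefix.
passages = [
    "MiniCPM-Embedding is a Chinese-English bilingual text embedding model.",
    "MiniCPM is a family of small language models for edge devices.",
]

# Normalized embeddings make the dot product equal to cosine similarity.
q_emb = model.encode([query], normalize_embeddings=True)
p_emb = model.encode(passages, normalize_embeddings=True)
scores = q_emb @ p_emb.T
print(scores)  # higher score -> more relevant passage
```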
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
使用方法 Usagetext
Instruction: {{ instruction }} Query: {{ query }}
输入格式 Input Format
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
(English: Instruction: Retrieve relevant answers for this medical question. Query: What are the causes of throat cancer?)
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
输入格式 Input Formattext
Instruction: 为这个医学问题检索相关回答。Query: 咽喉癌的成因是什么?
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
text
Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: However the warming trend is slower than most climate models have forecast.
Requirements: Python, transformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


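# Without a task-specific instruction, only the "Query: " prefix is added to the query side;
# an instruction (as in the template above) would go before it. Passages are encoded as-is.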
INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
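As a follow-up, the sketch below reuses the encode() helper above for the climate-claim retrieval example: the query gets the full instruction template, the candidate passages (invented here purely for illustration) are encoded without a prefix, and the dot product of the L2-normalized embeddings, i.e. cosine similarity, ranks them.

# Sketch: rank candidate passages for the climate-claim query with encode() from above.
# The two passages are illustrative placeholders, not taken from any benchmark.
instruction = "Given a claim about climate change, retrieve documents that support or refute the claim."
claim = "However the warming trend is slower than most climate models have forecast."

query_embedding = encode([f"Instruction: {instruction} Query: {claim}"])
passage_embeddings = encode([
    "Observed surface temperatures in recent decades have risen more slowly than the multi-model mean projected.",
    "Sea level rise is driven mainly by thermal expansion and melting land ice.",
])

# Embeddings are L2-normalized, so the dot product equals cosine similarity.
similarities = (query_embedding @ passage_embeddings.T)[0]
ranked = similarities.argsort()[::-1]
for idx in ranked:
    print(idx, float(similarities[idx]))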
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
环境要求 Requirementspythontransformers
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
# You can also use the following line to enable the Flash Attention 2 implementation
# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
model.eval()

# 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
# As we scale hidden states in `model.forward`, mean pooling here actually works as weighted mean pooling
def mean_pooling(hidden, attention_mask):
    s = torch.sum(hidden * attention_mask.unsqueeze(-1).float(), dim=1)
    d = attention_mask.sum(dim=1, keepdim=True).float()
    reps = s / d
    return reps

@torch.no_grad()
def encode(input_texts):
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True).to("cuda")
    
    outputs = model(**batch_dict)
    attention_mask = batch_dict["attention_mask"]
    hidden = outputs.last_hidden_state

    reps = mean_pooling(hidden, attention_mask)   
    embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
    return embeddings

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]


INSTRUCTION = "Query: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries)
embeddings_doc = encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.3535913825035095, 0.18596848845481873]]
使用方法 Usage (sentence_transformers)
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"torch_dtype": torch.float16})
# You can also enable the Flash Attention 2 implementation by passing it through model_kwargs
# model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation": "flash_attention_2", "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
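
For larger document collections, the corpus can be embedded once and queried afterwards. The following is a minimal sketch under the same setup, not from the original card: it reuses the `model` and `INSTRUCTION` from the snippet above, `corpus` and `search` are illustrative names, and `batch_size` / `show_progress_bar` are standard `encode()` arguments.

import numpy as np

# Encode the corpus once, in batches, and keep the embeddings in memory.
corpus = ["beijing", "shanghai", "guangzhou"]
corpus_emb = model.encode(corpus, batch_size=32, show_progress_bar=False)

def search(query, top_k=2):
    # Only the query gets the "Query: " instruction, matching the usage above.
    q_emb = model.encode([query], prompt=INSTRUCTION)
    sims = (q_emb @ corpus_emb.T)[0]
    best = np.argsort(-sims)[:top_k]
    return [(corpus[i], float(sims[i])) for i in best]

print(search("中国的首都是哪里?", top_k=1))  # expected to return "beijing" with the highest score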

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
[[0.3535913825035095, 0.18596848845481873]]pythonpytorch
import torch
from sentence_transformers import SentenceTransformer

model_name = "openbmb/MiniCPM-Embedding"
model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
# You can also use the following line to enable the Flash Attention 2 implementation
# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})

queries = ["中国的首都是哪里?"]
passages = ["beijing", "shanghai"]

INSTRUCTION = "Query: "

embeddings_query = model.encode(queries, prompt=INSTRUCTION)
embeddings_doc = model.encode(passages)

scores = (embeddings_query @ embeddings_doc.T)
print(scores.tolist())  # [[0.35365450382232666, 0.18592746555805206]]
