1.安装相关包

pip install transformers torch einops
pip install numpy
sudo apt install nvidia-cuda-toolkit
pip install flash-attn --no-build-isolation

 flash-attn包安装不了问题解决

python -m pip install ninja -i https://pypi.tuna.tsinghua.edu.cn/simple
git clone https://github.com/Dao-AILab/flash-attention
cd flash-attention
git submodule update --init --recursive
python -m pip install wheel==0.41.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
python  setup.py install    #需要一段时间

2.下载模型

import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'  # mirror endpoint for faster access from mainland China
from huggingface_hub import snapshot_download

# Download the complete model snapshot (config, weights, tokenizer)
# into a local directory so later steps can load it offline.
download_options = {
    "repo_id": "jinaai/jina-embeddings-v3",
    "local_dir": "./jina-embeddings-v3",   # local save path
    "revision": "main",                    # latest revision by default
    "local_dir_use_symlinks": False,       # copy real files, no symlinks
    "token": None,                         # public repo, no auth needed
    "resume_download": True,               # resume interrupted downloads
}
snapshot_download(**download_options)

3.模型嵌入功能使用

import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from transformers import AutoModel

# Load the locally downloaded model. trust_remote_code is required because
# jina-embeddings-v3 ships custom modeling code; the model is moved to GPU.
model = AutoModel.from_pretrained(
    "./jina-embeddings-v3",
    trust_remote_code=True,
    local_files_only=False,
).to("cuda")

texts = [
    "Follow the white rabbit.",  # English
    "Sigue al conejo blanco.",  # Spanish
    "Suis le lapin blanc.",  # French
    "跟着白兔走。",  # Chinese
    "اتبع الأرنب الأبيض.",  # Arabic
    "Folge dem weißen Kaninchen.",  # German
]

# Supported tasks: 'retrieval.query', 'retrieval.passage', 'separation',
# 'classification', 'text-matching'.
# max_length caps the sequence length (model maximum is 8192);
# truncate_dim shrinks the output embedding to a smaller dimension.
encode_options = dict(task="text-matching", max_length=2048, truncate_dim=256)
embeddings = model.encode(texts, **encode_options)

# Cosine-style similarity between the first two texts.
print(embeddings[0] @ embeddings[1].T)

4.SentenceTransformer

4.1安装包

pip install -U sentence-transformers
pip install onnxruntime

4.2嵌入功能

from sentence_transformers import SentenceTransformer
import torch

# Prefer the GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the model; trust_remote_code is needed for the custom code
# bundled with jina-embeddings-v3.
model = SentenceTransformer(
    "./jina-embeddings-v3",
    trust_remote_code=True,
    device=device,
)

# Available tasks: classification, retrieval.query, retrieval.passage,
# text-matching, separation.
task = "retrieval.query"
texts = [
    "Follow the white rabbit.",  # English
    "Sigue al conejo blanco.",  # Spanish
    "Suis le lapin blanc.",  # French
    "跟着白兔走。",  # Chinese
    "اتبع الأرنب الأبيض.",  # Arabic
    "Folge dem weißen Kaninchen.",  # German
]

# Generate embeddings. task selects the LoRA adapter; prompt_name selects
# the instruction prompt prepended to each text.
embeddings = model.encode(
    texts,
    task=task,
    prompt_name="retrieval.query",
    batch_size=32,
    convert_to_tensor=True,  # return a PyTorch tensor
)

# Example similarity: the first text scored against itself.
similarity = embeddings[0] @ embeddings[0].T
print(f"Similarity score: {similarity.item():.4f}")

# Persist the embeddings for later reuse.
torch.save(embeddings, "embeddings.pt")

4.3ONNX推理

import numpy as np
from transformers import AutoTokenizer, AutoModel, PretrainedConfig
import onnxruntime

def process_texts(texts: list, task_type: str = 'text-matching'):
    """Tokenize *texts* and run them through the ONNX export of jina-embeddings-v3.

    Args:
        texts: List of input strings to embed.
        task_type: One of the model's LoRA adaptation names
            (e.g. 'text-matching', 'retrieval.query'); mapped to the
            integer task_id the ONNX graph expects.

    Returns:
        np.ndarray of shape (len(texts), hidden_dim) with L2-normalized
        embeddings.
    """
    tokenizer = AutoTokenizer.from_pretrained('./jina-embeddings-v3')
    config = PretrainedConfig.from_pretrained('./jina-embeddings-v3')

    # Tokenize with padding so the batch forms a rectangular array.
    inputs = tokenizer(texts, padding=True, return_tensors='np')

    # ONNX inference; task_id selects the LoRA adapter inside the graph.
    session = onnxruntime.InferenceSession('./jina-embeddings-v3/onnx/model.onnx')
    task_id = np.array([config.lora_adaptations.index(task_type)], dtype=np.int64)
    outputs = session.run(
        None,
        {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'task_id': task_id
        }
    )[0]

    # Masked mean pooling: a plain outputs.mean(axis=1) would average over
    # padding positions too, skewing embeddings for shorter texts in the
    # batch. Weight by the attention mask and divide by the true token count.
    mask = inputs['attention_mask'][:, :, None].astype(outputs.dtype)
    token_counts = np.clip(mask.sum(axis=1), a_min=1e-9, a_max=None)
    embeddings = (outputs * mask).sum(axis=1) / token_counts

    # L2-normalize so dot products are cosine similarities.
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings

if __name__ == "__main__":
    # Sample inputs: an English/Chinese translation pair plus a
    # different-topic English text.
    texts = [
        "Natural language processing",
        "自然语言处理",
        "Computer science",
    ]

    # Embed the samples with the default 'text-matching' task.
    embeddings = process_texts(texts)

    # 1. Shape check: three vectors of the expected dimensionality.
    assert embeddings.shape == (3, 1024), f"形状错误: {embeddings.shape}"

    # 2. Pairwise similarities (embeddings are normalized, so dot = cosine).
    sim_matrix = embeddings @ embeddings.T
    print("相似度矩阵:\n", np.round(sim_matrix, 2))

    # The cross-lingual translation pair should score high...
    en_zh_sim = sim_matrix[0, 1]
    assert en_zh_sim > 0.7, f"跨语言相似度过低: {en_zh_sim}"

    # ...while the unrelated topic should score low.
    diff_sim = sim_matrix[0, 2]
    assert diff_sim < 0.5, f"差异文本相似度过高: {diff_sim}"

    print("所有验证通过!")

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐