jina-embeddings-v3模型的安装与使用(512维)
Ubuntu中jina-embeddings-v3模型的安装与使用
·
1.安装相关包
pip install transformers torch einops
pip install numpy
sudo apt install nvidia-cuda-toolkit
pip install flash-attn --no-build-isolation
flash-attn包安装不了问题解决
python -m pip install ninja -i https://pypi.tuna.tsinghua.edu.cn/simple
git clone https://github.com/Dao-AILab/flash-attention
cd flash-attention
git submodule update --init --recursive
python -m pip install wheel==0.41.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
python setup.py install #需要一段时间
2.下载模型
import os

# Route Hugging Face traffic through the domestic mirror for faster downloads.
# Must be set BEFORE huggingface_hub is imported, since the endpoint is read
# at import time.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from huggingface_hub import snapshot_download

# Fetch the complete snapshot (config, weights, tokenizer) into a local folder.
download_options = dict(
    repo_id="jinaai/jina-embeddings-v3",
    local_dir="./jina-embeddings-v3",   # local save path
    revision="main",                    # latest published revision
    local_dir_use_symlinks=False,       # copy real files instead of symlinks
    token=None,                         # public repo, no auth token needed
    resume_download=True,               # resume interrupted downloads
)
snapshot_download(**download_options)
3.模型嵌入功能使用
import os

# Point Hugging Face at the domestic mirror before transformers is imported.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from transformers import AutoModel

# Load the locally downloaded checkpoint; trust_remote_code is required
# because jina-embeddings-v3 ships custom modeling code.
model = AutoModel.from_pretrained(
    "./jina-embeddings-v3",
    trust_remote_code=True,
    local_files_only=False,
)
model = model.to("cuda")

# One sentence per language to showcase the multilingual embedding space.
texts = [
    "Follow the white rabbit.",  # English
    "Sigue al conejo blanco.",  # Spanish
    "Suis le lapin blanc.",  # French
    "跟着白兔走。",  # Chinese
    "اتبع الأرنب الأبيض.",  # Arabic
    "Folge dem weißen Kaninchen.",  # German
]

# Supported tasks: 'retrieval.query', 'retrieval.passage', 'separation',
# 'classification', 'text-matching'.
# max_length caps the token sequence (model maximum is 8192);
# truncate_dim shrinks the output embedding to the requested dimensionality.
embeddings = model.encode(texts, task="text-matching", max_length=2048, truncate_dim=256)

# Similarity between the first two embedding vectors.
print(embeddings[0] @ embeddings[1].T)
4.SentenceTransformer
4.1安装包
pip install -U sentence-transformers
pip install onnxruntime
4.2嵌入功能
from sentence_transformers import SentenceTransformer
import torch

# Build the encoder; fall back to CPU when no GPU is available.
_device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(
    "./jina-embeddings-v3",
    trust_remote_code=True,
    device=_device,
)

# Available tasks: classification, retrieval.query, retrieval.passage,
# text-matching, separation.
task = "retrieval.query"

# One sentence per language to showcase the multilingual embedding space.
texts = [
    "Follow the white rabbit.",  # English
    "Sigue al conejo blanco.",  # Spanish
    "Suis le lapin blanc.",  # French
    "跟着白兔走。",  # Chinese
    "اتبع الأرنب الأبيض.",  # Arabic
    "Folge dem weißen Kaninchen.",  # German
]

# Encode the batch; convert_to_tensor=True returns a PyTorch tensor.
embeddings = model.encode(
    texts,
    task=task,
    prompt_name="retrieval.query",
    batch_size=32,
    convert_to_tensor=True,
)

# Sanity check: a vector dotted with itself (example: first text vs itself).
similarity = embeddings[0] @ embeddings[0].T
print(f"Similarity score: {similarity.item():.4f}")

# Persist the embeddings for later reuse.
torch.save(embeddings, "embeddings.pt")
4.3ONNX推理
import numpy as np
from transformers import AutoTokenizer, AutoModel, PretrainedConfig
import onnxruntime
def process_texts(texts: list, task_type: str = 'text-matching'):
    """Embed *texts* with the local jina-embeddings-v3 ONNX model.

    Args:
        texts: list of input strings to embed.
        task_type: one of the model's LoRA task adapters (e.g.
            'text-matching', 'retrieval.query'); looked up in
            ``config.lora_adaptations`` to obtain the task id.

    Returns:
        A 2-D numpy array of L2-normalized embeddings, one row per text.

    Raises:
        ValueError: if ``task_type`` is not in ``config.lora_adaptations``.
    """
    # NOTE: tokenizer/config/session are re-created on every call; callers
    # embedding many batches may want to hoist these out themselves.
    tokenizer = AutoTokenizer.from_pretrained('./jina-embeddings-v3')
    config = PretrainedConfig.from_pretrained('./jina-embeddings-v3')
    # Truncate at the model's 8192-token limit so oversized inputs don't fail.
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=8192,
                       return_tensors='np')
    # ONNX inference with the task id selecting the LoRA adapter.
    session = onnxruntime.InferenceSession('./jina-embeddings-v3/onnx/model.onnx')
    task_id = np.array([config.lora_adaptations.index(task_type)], dtype=np.int64)
    outputs = session.run(
        None,
        {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'task_id': task_id
        }
    )[0]
    # Mask-aware mean pooling. A plain outputs.mean(axis=1) would average the
    # hidden states of padding tokens too, skewing embeddings whenever the
    # batch contains texts of different lengths.
    mask = inputs['attention_mask'][..., np.newaxis].astype(outputs.dtype)
    summed = (outputs * mask).sum(axis=1)
    token_counts = np.clip(mask.sum(axis=1), 1e-9, None)  # avoid divide-by-zero
    embeddings = summed / token_counts
    # L2-normalize so dot products are cosine similarities.
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings
if __name__ == "__main__":
    # Small mixed-language / mixed-topic sample.
    texts = [
        "Natural language processing",
        "自然语言处理",
        "Computer science"
    ]
    embeddings = process_texts(texts)

    # 1. Shape check: three inputs, 1024-dim vector each.
    assert embeddings.shape == (3, 1024), f"形状错误: {embeddings.shape}"

    # 2. Pairwise similarities (vectors are L2-normalized, so dot = cosine).
    sim_matrix = embeddings @ embeddings.T
    print("相似度矩阵:\n", np.round(sim_matrix, 2))

    # The English/Chinese paraphrase pair should score high...
    en_zh_sim = sim_matrix[0, 1]
    assert en_zh_sim > 0.7, f"跨语言相似度过低: {en_zh_sim}"

    # ...while unrelated topics should stay comparatively low.
    diff_sim = sim_matrix[0, 2]
    assert diff_sim < 0.5, f"差异文本相似度过高: {diff_sim}"
    print("所有验证通过!")
更多推荐
所有评论(0)