GTE-multilingual-base 本地向量服务部署与 Spring AI 集成方案

文章摘要本文介绍了一个本地化部署的GTE-multilingual-base向量服务方案，适用于内网环境和数据安全要求高的场景。方案采用Docker容器化部署，具备以下特点：无GPU要求，纯CPU环境下运行支持单文本和批量文本向量化处理通过FastAPI提供统一接口包含Spring AI集成方案采用Docker Compose实现一键部署系统架构包含向量模型服务层和Spring应用层

周全全

801人浏览 · 2025-11-18 10:29:20

周全全 · 2025-11-18 10:29:20 发布

GTE-multilingual-base 本地向量服务部署与 Spring AI 集成方案

前言

在构建 RAG（检索增强生成）系统时，文本向量化是核心环节之一。虽然云服务提供了便捷的 Embedding API，但在内网环境、数据安全要求高或需要控制成本的场景下，本地部署向量服务是更好的选择。

本文档详细介绍了如何在 无 GPU 的服务器 上部署 gte-multilingual-base 向量模型，并通过 FastAPI 提供统一的向量化服务接口。同时，我们将展示如何将其无缝集成到 Spring AI 项目中，实现完整的 RAG 系统。

本文执行方案基于claude4.5生成，已经验证可行

方案特点

Docker 一键部署：简化部署流程，降低运维成本
批量处理支持：支持单文本和批量文本向量化
CPU 优化：针对 CPU 环境进行性能优化
模型缓存复用：挂载 HuggingFace 缓存目录，避免重复下载
Spring AI 集成：提供完整的 Spring Boot 集成示例

一、系统架构

1.1 整体架构

1.2 技术栈

向量模型：Alibaba-NLP/gte-multilingual-base（768 维向量）
运行环境：CPU（无 GPU 要求）
服务框架：FastAPI + Uvicorn
容器化：Docker + Docker Compose
集成框架：Spring Boot + Spring AI

二、项目结构

embedding-service/
├── app.py              # FastAPI 主服务代码
├── requirements.txt    # Python 依赖
├── Dockerfile          # 镜像构建
└── docker-compose.yml  # 一键部署

三、核心文件内容

1. app.py

"""
GTE-multilingual-base FastAPI 推理服务
提供文本向量化 API，支持单文本和批量处理
"""
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import List
import torch
from sentence_transformers import SentenceTransformer
import logging
import time

# 配置日志
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# 创建 FastAPI 应用
app = FastAPI(
    title="GTE Embedding Service",
    description="Alibaba-NLP/gte-multilingual-base 文本向量化服务",
    version="1.0.0"
)

# 全局变量
model = None
MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"
DIMENSION = 768


class EmbedRequest(BaseModel):
    """单文本向量化请求"""
    text: str


class EmbedBatchRequest(BaseModel):
    """批量文本向量化请求"""
    texts: List[str]


class EmbedResponse(BaseModel):
    """向量化响应"""
    embedding: List[float]
    dimension: int


class EmbedBatchResponse(BaseModel):
    """批量向量化响应"""
    embeddings: List[List[float]]
    dimension: int
    count: int


class HealthResponse(BaseModel):
    """健康检查响应"""
    status: str
    model: str
    dimension: int
    device: str


@app.on_event("startup")
async def load_model():
    """启动时加载模型"""
    global model
    try:
        logger.info(f"开始加载模型: {MODEL_NAME}")
        start_time = time.time()
        
        # 加载模型（自动下载到 ~/.cache/huggingface/）
        # trust_remote_code=True: GTE 模型使用自定义配置代码
        model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)
        
        # CPU 优化设置
        if torch.cuda.is_available():
            logger.info("检测到 GPU，使用 CUDA")
            model = model.cuda()
        else:
            logger.info("使用 CPU 推理")
            # CPU 优化
            torch.set_num_threads(4)  # 根据 CPU 核心数调整
        
        load_time = time.time() - start_time
        logger.info(f"模型加载成功！耗时: {load_time:.2f}秒")
        logger.info(f"向量维度: {DIMENSION}")
        
    except Exception as e:
        logger.error(f"模型加载失败: {e}")
        raise


@app.get("/", response_model=dict)
async def root():
    """根路径"""
    return {
        "service": "GTE Embedding Service",
        "model": MODEL_NAME,
        "dimension": DIMENSION,
        "endpoints": {
            "health": "/health",
            "embed": "/embed (POST)",
            "embed_batch": "/embed_batch (POST)"
        }
    }


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """健康检查接口"""
    if model is None:
        raise HTTPException(status_code=503, detail="模型未加载")
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    return HealthResponse(
        status="healthy",
        model=MODEL_NAME,
        dimension=DIMENSION,
        device=device
    )


@app.post("/embed", response_model=EmbedResponse)
async def embed_text(request: EmbedRequest):
    """
    单文本向量化
    
    请求示例:
    {
        "text": "这是一段测试文本"
    }
    """
    if model is None:
        raise HTTPException(status_code=503, detail="模型未加载")
    
    if not request.text or not request.text.strip():
        raise HTTPException(status_code=400, detail="文本不能为空")
    
    try:
        start_time = time.time()
        
        # 生成向量
        embedding = model.encode(request.text, convert_to_numpy=True)
        
        # 转换为列表
        embedding_list = embedding.tolist()
        
        inference_time = time.time() - start_time
        logger.info(f"向量生成成功，耗时: {inference_time*1000:.2f}ms")
        
        return EmbedResponse(
            embedding=embedding_list,
            dimension=len(embedding_list)
        )
        
    except Exception as e:
        logger.error(f"向量生成失败: {e}")
        raise HTTPException(status_code=500, detail=f"向量生成失败: {str(e)}")


@app.post("/embed_batch", response_model=EmbedBatchResponse)
async def embed_texts(request: EmbedBatchRequest):
    """
    批量文本向量化
    
    请求示例:
    {
        "texts": ["文本1", "文本2", "文本3"]
    }
    """
    if model is None:
        raise HTTPException(status_code=503, detail="模型未加载")
    
    if not request.texts or len(request.texts) == 0:
        raise HTTPException(status_code=400, detail="文本列表不能为空")
    
    # 过滤空文本
    valid_texts = [text for text in request.texts if text and text.strip()]
    if not valid_texts:
        raise HTTPException(status_code=400, detail="没有有效的文本")
    
    try:
        start_time = time.time()
        
        # 批量生成向量
        embeddings = model.encode(valid_texts, convert_to_numpy=True, batch_size=32)
        
        # 转换为列表
        embeddings_list = embeddings.tolist()
        
        inference_time = time.time() - start_time
        logger.info(f"批量向量生成成功，数量: {len(embeddings_list)}, 耗时: {inference_time:.2f}s")
        
        return EmbedBatchResponse(
            embeddings=embeddings_list,
            dimension=DIMENSION,
            count=len(embeddings_list)
        )
        
    except Exception as e:
        logger.error(f"批量向量生成失败: {e}")
        raise HTTPException(status_code=500, detail=f"批量向量生成失败: {str(e)}")


@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """全局异常处理"""
    logger.error(f"未处理的异常: {exc}")
    return JSONResponse(
        status_code=500,
        content={"detail": "服务器内部错误"}
    )


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=5000)

2. requirements.txt

# Web 框架
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0

# 深度学习框架
torch==2.1.2
transformers==4.36.0
sentence-transformers==2.3.1

# 基础库
numpy==1.24.3
tokenizers==0.15.0

3. Dockerfile

# 使用官方 Python 镜像
FROM python:3.10-slim

# 设置工作目录
WORKDIR /app

# 设置环境变量
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# 更换为阿里云镜像源并安装系统依赖
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \
    apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# 复制依赖文件
COPY requirements.txt .

# 安装 Python 依赖（使用阿里云镜像源）
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt

# 复制应用代码
COPY app.py .

# 暴露端口
EXPOSE 5000

# 健康检查
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# 启动命令
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5000", "--workers", "1"]

4. docker-compose.yml

version: '3.8'

services:
  embedding-service:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: gte-embedding-service
    ports:
      - "5000:5000"
    environment:
      # CPU 优化设置
      - OMP_NUM_THREADS=4
      - MKL_NUM_THREADS=4
      # 使用 HuggingFace 镜像站（国内访问）
      - HF_ENDPOINT=https://hf-mirror.com
    volumes:
      # 挂载模型缓存目录，避免重复下载
      - ~/.cache/huggingface:/root/.cache/huggingface
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    # CPU 资源限制（根据实际情况调整）
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 8G
        reservations:
          cpus: '4'
          memory: 4G

四、部署步骤

Docker 部署

上传embedding-service文件夹到服务器

cd embedding-service
docker-compose up -d

执行输出：

[root@manager168 embedding-service]# docker compose up -d --build
[+] Building 531.9s (11/11) FINISHED docker:default
=> [embedding-service internal] load build definition from Dockerfile 0.0s
=> => transferring dockerfile: 1.09kB 0.0s
=> [embedding-service internal] load metadata for docker.io/library/python:3.10-slim 0.5s
=> [embedding-service internal] load .dockerignore 0.0s
=> => transferring context: 2B 0.0s
=> [embedding-service 1/6] FROM docker.io/library/python:3.10-slim@sha256:975a1e200a16719060d391eea4ac66ee067d709cc22a32f4ca4737731eae36c0 0.0s
=> [embedding-service internal] load build context 0.0s
=> => transferring context: 63B 0.0s
=> CACHED [embedding-service 2/6] WORKDIR /app 0.0s
=> CACHED [embedding-service 3/6] RUN sed -i ‘s/deb.debian.org/mirrors.aliyun.com/g’ /etc/apt/sources.list.d/debian.sources && apt-get updat 0.0s
=> CACHED [embedding-service 4/6] COPY requirements.txt . 0.0s
=> [embedding-service 5/6] RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt 480.2s
=> [embedding-service 6/6] COPY app.py . 0.1s
=> [embedding-service] exporting to image 51.1s
=> => exporting layers 51.1s
=> => writing image sha256:6f215be4bd4923ef16cc87f77fc3cbfb0f0288395eb2145a6be1d86dd0ffde4b 0.0s
=> => naming to docker.io/library/embedding-service-embedding-service 0.0s
[+] Running 2/2
✔ Network embedding-service_default Created 0.1s
✔ Container gte-embedding-service Started 0.6s

查看运行情况

docker compose logs -f

[root@manager168 embedding-service]# docker compose logs -f
WARN[0000] /mnt/sdb3/embedding-service/docker-compose.yml: version is obsolete
gte-embedding-service | INFO: Started server process [1]
gte-embedding-service | INFO: Waiting for application startup.
gte-embedding-service | 2025-11-14 08:07:56,706 - app - INFO - 开始加载模型: Alibaba-NLP/gte-multilingual-base
gte-embedding-service | 2025-11-14 08:07:56,706 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: Alibaba-NLP/gte-multilingual-base
gte-embedding-service | /usr/local/lib/python3.10/site-packages/huggingface_hub/file_download.py:942: FutureWarning: resume_download is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use force_download=True.
gte-embedding-service | warnings.warn(
gte-embedding-service | A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
gte-embedding-service | - configuration.py
gte-embedding-service | . Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
gte-embedding-service | A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
gte-embedding-service | - modeling.py
gte-embedding-service | . Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
gte-embedding-service | Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: [‘classifier.bias’, ‘classifier.weight’]
gte-embedding-service | - This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
gte-embedding-service | - This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
gte-embedding-service | 2025-11-14 08:10:34,530 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
gte-embedding-service | 2025-11-14 08:10:34,534 - app - INFO - 使用 CPU 推理
gte-embedding-service | 2025-11-14 08:10:34,550 - app - INFO - 模型加载成功！耗时: 157.84秒
gte-embedding-service | 2025-11-14 08:10:34,550 - app - INFO - 向量维度: 768
gte-embedding-service | INFO: Application startup complete.
gte-embedding-service | INFO: Uvicorn running on http://0.0.0.0:5000 (Press CTRL+C to quit)
gte-embedding-service | INFO: 127.0.0.1:35030 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:35582 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36112 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36624 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36638 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36646 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36656 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36664 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36676 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36688 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36700 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36708 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36718 - “GET /health HTTP/1.1” 200 OK
gte-embedding-service | INFO: 127.0.0.1:36728 - “GET /health HTTP/1.1” 200 O

五、API 使用示例

1. 单文本向量化

curl -X POST http://localhost:5000/embed \
  -H "Content-Type: application/json" \
  -d '{"text": "这是一段测试文本"}'

{
  "embedding": [...],
  "dimension": 768
}

2. 批量向量化

curl -X POST http://localhost:5000/embed_batch \
  -H "Content-Type: application/json" \
  -d '{"texts": ["文本1", "文本2", "文本3"]}'

{
  "embeddings": [...],
  "dimension": 768,
  "count": 3
}

六、Spring AI 项目集成

6.1 项目配置

在 Spring Boot 项目的 application.yml 中添加向量服务配置：

# 自定义配置
app:
  elasticsearch:
    # 向量维度（gte-multilingual-base 模型）
    vector-dimension: 768
    # 批量处理大小
    batch-size: 100
    # 本地 Embedding 服务 URL
    embedding-service-url: http://192.168.0.168:5000

6.2 配置类实现

创建配置类 AppConfig.java 用于读取配置和初始化 RestTemplate：

package com.zhouquan.ai.config;

import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.boot.web.client.RestTemplateBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.http.client.BufferingClientHttpRequestFactory;
import org.springframework.http.client.SimpleClientHttpRequestFactory;
import org.springframework.web.client.RestTemplate;

/**
 * 应用自定义配置
 */
@Data
@Configuration
@ConfigurationProperties(prefix = "app.elasticsearch")
public class AppConfig {

    /**
     * 向量维度
     */
    private Integer vectorDimension;

    /**
     * 批量处理大小
     */
    private Integer batchSize;

    /**
     * Embedding 服务 URL
     */
    private String embeddingServiceUrl;

    @Bean
    public RestTemplate restTemplate(RestTemplateBuilder builder) {
        // 配置请求工厂
        SimpleClientHttpRequestFactory requestFactory = new SimpleClientHttpRequestFactory();
        // 设置连接超时时间（30秒）
        requestFactory.setConnectTimeout(30000);
        // 设置读取超时时间（5分钟，因为向量化可能需要较长时间）
        requestFactory.setReadTimeout(300000);
        
        // 使用 BufferingClientHttpRequestFactory 来支持请求体的多次读取
        BufferingClientHttpRequestFactory bufferingFactory = 
                new BufferingClientHttpRequestFactory(requestFactory);
        
        return builder
                .requestFactory(() -> bufferingFactory)
                .build();
    }
}

6.3 EmbeddingService 实现

创建 EmbeddingService.java 封装向量化服务调用：

package com.zhouquan.ai.service;

import com.zhouquan.ai.config.AppConfig;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestTemplate;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Embedding 向量化服务
 * 使用本地部署的 gte-multilingual-base 模型生成文本向量
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class EmbeddingService {

    private final RestTemplate restTemplate;
    private final AppConfig appConfig;

    /**
     * 将单个文本转换为向量
     */
    public float[] embedText(String text) {
        if (text == null || text.trim().isEmpty()) {
            log.warn("文本为空，返回空向量");
            return new float[0];
        }

        try {
            List<float[]> vectors = embedTexts(List.of(text));
            if (vectors.isEmpty()) {
                log.warn("向量化结果为空");
                return new float[0];
            }
            return vectors.get(0);
        } catch (Exception e) {
            log.error("文本向量化失败: {}", text.substring(0, Math.min(100, text.length())), e);
            throw new RuntimeException("文本向量化失败", e);
        }
    }

    /**
     * 批量将文本转换为向量
     * 支持自动重试机制，提高服务稳定性
     */
    public List<float[]> embedTexts(List<String> texts) {
        if (texts == null || texts.isEmpty()) {
            return new ArrayList<>();
        }

        int maxRetries = 3;
        int retryDelay = 1000; // 1秒
        
        for (int attempt = 1; attempt <= maxRetries; attempt++) {
            try {
                String url = appConfig.getEmbeddingServiceUrl() + "/embed_batch";
                log.info("调用本地 embedding 服务: {}, 文本数量: {}, 尝试次数: {}/{}", 
                        url, texts.size(), attempt, maxRetries);

                // 计算请求大小
                long totalChars = texts.stream().mapToLong(String::length).sum();
                log.info("请求文本总字符数: {}", totalChars);

                // 构建请求体
                Map<String, Object> requestBody = new HashMap<>();
                requestBody.put("texts", texts);

                // 设置请求头
                HttpHeaders headers = new HttpHeaders();
                headers.setContentType(MediaType.APPLICATION_JSON);
                headers.set("Connection", "keep-alive");
                HttpEntity<Map<String, Object>> request = new HttpEntity<>(requestBody, headers);

                // 发送 POST 请求
                ResponseEntity<EmbedBatchResponse> response = restTemplate.postForEntity(
                        url,
                        request,
                        EmbedBatchResponse.class
                );

                EmbedBatchResponse body = response.getBody();
                if (body == null || body.getEmbeddings() == null || body.getEmbeddings().isEmpty()) {
                    log.error("向量化服务返回结果为空");
                    throw new RuntimeException("向量化服务返回结果为空");
                }

                log.info("向量化成功，返回 {} 个向量，维度: {}", body.getCount(), body.getDimension());

                // 将 List<List<Double>> 转换为 List<float[]>
                List<float[]> result = new ArrayList<>();
                for (List<Double> embedding : body.getEmbeddings()) {
                    float[] floatArray = new float[embedding.size()];
                    for (int i = 0; i < embedding.size(); i++) {
                        floatArray[i] = embedding.get(i).floatValue();
                    }
                    result.add(floatArray);
                }

                return result;

            } catch (org.springframework.web.client.ResourceAccessException e) {
                if (attempt == maxRetries) {
                    log.error("批量文本向量化失败，已达最大重试次数 {}", maxRetries, e);
                    throw new RuntimeException("批量文本向量化失败（连接错误）: " + e.getMessage() + 
                            "。请检查向量化服务是否正常运行: " + appConfig.getEmbeddingServiceUrl(), e);
                }
                
                log.warn("第 {} 次尝试失败（连接错误），{}ms 后重试: {}", 
                        attempt, retryDelay, e.getMessage());
                
                try {
                    Thread.sleep(retryDelay);
                    retryDelay *= 2; // 指数退避
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    throw new RuntimeException("重试被中断", ie);
                }
                
            } catch (Exception e) {
                log.error("批量文本向量化失败，文本数量: {}", texts.size(), e);
                throw new RuntimeException("批量文本向量化失败: " + e.getMessage(), e);
            }
        }
        
        throw new RuntimeException("批量文本向量化失败：未知错误");
    }

    /**
     * 计算向量维度
     */
    public int getVectorDimension(String sampleText) {
        float[] vector = embedText(sampleText);
        return vector.length;
    }

    /**
     * 批量响应数据结构
     */
    @lombok.Data
    private static class EmbedBatchResponse {
        private List<List<Double>> embeddings;
        private Integer dimension;
        private Integer count;
    }
}

6.4 在 RAG 服务中使用

在 RagService.java 中使用 EmbeddingService：

@Service
@RequiredArgsConstructor
public class RagService {

    private final EmbeddingService embeddingService;
    // ... 其他依赖

    public RagResponse ask(String question, List<QaHistory> historyContext) {
        // 1. 将问题向量化
        float[] questionVector = embeddingService.embedText(question);
        
        // 2. 从向量库检索相似文档
        List<RetrievedDocument> retrievedDocs = searchSimilarDocuments(questionVector);
        
        // 3. 构建增强提示词并生成答案
        // ...
    }
}

6.5 批量向量化示例

在 DocumentVectorizationService.java 中批量处理文档：

@Service
@RequiredArgsConstructor
public class DocumentVectorizationService {

    private final EmbeddingService embeddingService;
    
    public void vectorizeDocuments(int startIndex, int size) {
        // 1. 从数据源读取文档
        List<Document> documents = loadDocuments(startIndex, size);
        
        // 2. 提取文本内容
        List<String> texts = documents.stream()
            .map(Document::getContent)
            .collect(Collectors.toList());
        
        // 3. 批量向量化（利用批量接口提高效率）
        List<float[]> vectors = embeddingService.embedTexts(texts);
        
        // 4. 存储向量到 Elasticsearch
        saveVectorsToElasticsearch(documents, vectors);
    }
}