1 ollama install
2 ollama pull&run llama3.2:1b-instruct
3 vectorstores (RecursiveCharacterTextSplitter/PromptTemplate)
4 create api
处理成模型可以快速查询的格式
接收用户问题,从向量库检索相关知识,然后让LLM生成答案

#!/usr/bin/env python3
# ingest_pdfs.py 

import os
import warnings
warnings.filterwarnings('ignore')

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import requests

def check_ollama():
    """Return True if a local Ollama server responds on its default port.

    Probes the lightweight /api/tags endpoint with a short timeout so that
    module import does not hang when the service is absent.
    """
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        # Connection refused, timeout, DNS failure — all mean "not running".
        # A bare except here would also swallow KeyboardInterrupt/SystemExit.
        return False

# Choose the embedding backend at import time: prefer a local Ollama server,
# fall back to a CPU HuggingFace sentence-transformer when Ollama is down.
if check_ollama():
    print("使用Ollama本地嵌入...")
    from langchain_community.embeddings import OllamaEmbeddings
    embedding_model = OllamaEmbeddings(
        model="deepseek-r1:1.5b",  # NOTE(review): deepseek-r1 is a chat model, not a dedicated embedding model — confirm embedding quality
        base_url="http://localhost:11434"
    )
else:
    print("使用HuggingFace嵌入...")
    from langchain_community.embeddings import HuggingFaceEmbeddings
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False}
    )

def process_pdfs(pdf_paths=None):
    """Load PDFs, split them into chunks, and persist a Chroma vector store.

    Parameters
    ----------
    pdf_paths : list[str] | None
        Paths of the PDF files to ingest. Defaults to the original two
        hard-coded sample documents, preserving previous behavior.

    Returns
    -------
    Chroma | None
        The persisted vector store, or ``None`` on any failure
        (missing file, no loadable pages, or a Chroma/embedding error).
    """
    # Default to the original hard-coded documents for backward compatibility.
    if pdf_paths is None:
        pdf_paths = [
            "/home/michael/Downloads/datagroup.pdf",
            "/home/michael/Downloads/data.pdf"
        ]

    # Abort early if any input file is missing.
    for path in pdf_paths:
        if not os.path.exists(path):
            print(f"❌ 文件不存在: {path}")
            return None
        else:
            print(f"✅ 找到文件: {path}")

    # 1. Load the PDF documents (one Document per page).
    print("\n📚 正在加载PDF文档...")
    documents = []

    for pdf_path in pdf_paths:
        try:
            loader = PyPDFLoader(pdf_path)
            docs = loader.load()

            # Attach provenance metadata so answers can cite their source.
            for doc in docs:
                doc.metadata['source'] = pdf_path
                doc.metadata['filename'] = os.path.basename(pdf_path)

            documents.extend(docs)
            print(f"  已加载: {os.path.basename(pdf_path)} - {len(docs)}页")

        except Exception as e:
            # Best-effort ingestion: skip unreadable files, keep going.
            print(f"❌ 加载失败 {pdf_path}: {e}")
            continue

    if not documents:
        print("❌ 没有成功加载任何文档")
        return None

    print(f"📄 总共加载: {len(documents)}页文档")

    # 2. Split the text into overlapping chunks; the separator list includes
    #    Chinese punctuation so splits fall on sentence boundaries.
    print("\n✂️ 正在分割文本...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", "。", "!", "?", ";", ",", "、", " "]
    )

    chunks = text_splitter.split_documents(documents)
    print(f"📊 分割为: {len(chunks)}个文本块")

    # Show a couple of sample chunks for a quick sanity check.
    print("\n样本文本块:")
    for i in range(min(2, len(chunks))):
        print(f"  块 {i+1}: {chunks[i].page_content[:100]}...")

    # 3. Embed the chunks and persist them to a local Chroma database.
    print("\n🔄 正在创建向量数据库...")
    try:
        vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory="./data_vector_db"
        )

        print("✅ 向量数据库创建成功!")
        print(f"💾 保存路径: ./data_vector_db")

        return vector_store

    except Exception as e:
        print(f"❌ 创建向量数据库失败: {e}")
        return None

if __name__ == "__main__":
    # Entry point: run the full ingestion pipeline and report the outcome.
    print("🚀 开始处理文档...")
    result = process_pdfs()

    outcome = (
        "\n🎯 处理完成!您现在可以运行查询脚本进行问答。"
        if result
        else "\n💥 处理失败,请检查以上错误信息。"
    )
    print(outcome)
#!/usr/bin/env python3
# rag_deepseek_app.py - 

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import PromptTemplate
import requests
import os

class DeepSeekRAG:
    """Retrieval-augmented question answering over a local Chroma store.

    Retrieves the most similar chunks from a persisted Chroma vector
    database and asks a locally served Ollama model to answer strictly
    from that retrieved context.
    """

    def __init__(self, persist_directory="./deepseek_vector_db",
                 model="deepseek-r1:1.5b",
                 base_url="http://localhost:11434"):
        """Load the vector store and prepare the prompt template.

        Parameters
        ----------
        persist_directory : str
            Directory of the persisted Chroma database.
            NOTE(review): the ingest script above persists to
            "./data_vector_db" — make sure both paths agree, or pass the
            correct directory here.
        model : str
            Ollama model tag, used for both embedding and generation.
        base_url : str
            Base URL of the Ollama HTTP API.
        """
        self.model = model
        self.base_url = base_url

        # 1. Embeddings must come from the same model that built the store,
        #    otherwise similarity search results are meaningless.
        self.embedding_model = OllamaEmbeddings(
            model=model,
            base_url=base_url
        )

        # Open the previously created vector database.
        self.vector_db = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embedding_model
        )

        # 2. Prompt template: answer only from the supplied context,
        #    and refuse explicitly when the context has no answer.
        self.prompt_template = PromptTemplate.from_template(
            """请你扮演名为"小助手"的智能客服。你的态度专业、准确。请严格根据提供的【上下文信息】来回答问题。如果上下文信息中没有答案,请直接说"根据现有资料,我暂时无法回答这个问题"。

【上下文信息】:
{context}

【用户问题】:
{question}

【小助手回答】:"""
        )

    def get_ai_response(self, user_question):
        """Answer *user_question* with retrieval-augmented generation.

        Returns
        -------
        tuple[str, list]
            ``(answer_text, source_documents)``; the document list is
            empty on failure or when nothing relevant was retrieved.
        """
        try:
            # 3.1 Retrieve the top-3 most similar chunks from the store.
            relevant_docs = self.vector_db.similarity_search(user_question, k=3)
            context_text = "\n\n".join([doc.page_content for doc in relevant_docs])

            if not context_text.strip():
                return "根据现有资料,我暂时无法回答这个问题。", []

            # 3.2 Assemble the final prompt from template + retrieved context.
            final_prompt = self.prompt_template.format(
                context=context_text,
                question=user_question
            )

            # 3.3 Call the model via Ollama's non-streaming generate endpoint.
            response = requests.post(
                f'{self.base_url}/api/generate',
                json={
                    'model': self.model,
                    'prompt': final_prompt,
                    'stream': False,
                    'options': {
                        'temperature': 0.1,  # low temperature for grounded answers
                        'top_k': 20,
                        'top_p': 0.8,
                        'num_predict': 512
                    }
                },
                timeout=60
            )

            if response.status_code == 200:
                return response.json()['response'], relevant_docs
            else:
                return f"抱歉,AI服务暂时不可用。错误代码: {response.status_code}", []

        except requests.exceptions.Timeout:
            return "请求超时,请稍后重试。", []
        except Exception as e:
            # Last-resort guard so the CLI loop never crashes on one question.
            return f"系统错误: {str(e)}", []

def main():
    """Interactive command-line loop for the document QA system."""
    print("🤖 文档智能问答系统 - DeepSeek-R1 1.5B")
    print("=" * 50)

    # Abort early if the ingestion step has not been run yet.
    if not os.path.exists("./deepseek_vector_db"):
        print("❌ 向量数据库不存在,请先运行 ingest_deepseek.py")
        return

    # Make sure the Ollama HTTP API is reachable before building the stack.
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code != 200:
            print("❌ Ollama服务未运行")
            return
    except requests.exceptions.RequestException:
        # Narrowed from a bare except: any network failure means "not running".
        print("❌ 无法连接Ollama服务,请确保Ollama正在运行")
        return

    # Initialize the RAG pipeline (vector store + prompt + model client).
    rag_system = DeepSeekRAG()
    print("✅ 系统初始化完成")
    print("\n💡 提示: 您可以询问关于文档的任何问题")
    print("输入 'quit' 或 '退出' 结束程序\n")

    while True:
        try:
            user_question = input("💬 您的问题: ").strip()

            if user_question.lower() in ['quit', '退出', 'exit']:
                print("👋 再见!")
                break

            if not user_question:
                continue

            print("🔍 搜索中...")
            answer, sources = rag_system.get_ai_response(user_question)

            print(f"\n📝 回答: {answer}")

            # Show the provenance of the retrieved chunks.
            if sources:
                print(f"\n📚 参考来源:")
                for i, doc in enumerate(sources, 1):
                    filename = os.path.basename(doc.metadata.get('source', '未知文档'))
                    page = doc.metadata.get('page', '未知页码')
                    # Bug fix: the original printed a literal "(unknown)"
                    # placeholder instead of the filename computed above.
                    print(f"  {i}. {filename} - 第{page}页")

                    # Preview at most the first 100 characters of the chunk.
                    preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
                    print(f"     内容: {preview}")

            print("\n" + "=" * 50)

        except KeyboardInterrupt:
            print("\n👋 用户中断,再见!")
            break
        except Exception as e:
            # Keep the loop alive on unexpected per-question errors.
            print(f"❌ 发生错误: {e}")

# Smoke-test helper
def test_system():
    """Run a few canned questions through the RAG pipeline and print results."""
    print("🧪 系统测试...")

    rag = DeepSeekRAG()

    for question in (
        "你是谁?",
        "策略有哪些?",
        "风险管理的方法?",
    ):
        print(f"\n测试问题: {question}")
        answer, sources = rag.get_ai_response(question)
        print(f"回答: {answer}")
        print(f"参考文档数: {len(sources)}")

if __name__ == "__main__":
    import sys

    # "python app.py test" runs the smoke test; anything else starts the
    # interactive chat loop.
    if sys.argv[1:2] == ["test"]:
        test_system()
    else:
        main()

国内安装

pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple langchain langchain-community sentence-transformers chromadb pypdf
 

Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐