5.9 KiB
5.9 KiB
LLM 应用开发指南 (RAG/Agent)
RAG 系统架构
用户查询 → 查询改写 → 向量检索 → 重排序 → LLM生成 → 回答
↓
向量数据库
↑
文档切分 → Embedding
文档处理
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, TextLoader
# 加载文档
loader = PyPDFLoader('document.pdf')
documents = loader.load()
# 文档切分
splitter = RecursiveCharacterTextSplitter(
chunk_size=500,
chunk_overlap=50,
separators=['\n\n', '\n', '。', '!', '?', ',', ' ']
)
chunks = splitter.split_documents(documents)
向量数据库
Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
# Embedding 模型
embeddings = HuggingFaceEmbeddings(
model_name='BAAI/bge-large-zh-v1.5',
model_kwargs={'device': 'cuda'},
encode_kwargs={'normalize_embeddings': True}
)
# 创建向量库
vectorstore = Chroma.from_documents(
documents=chunks,
embedding=embeddings,
persist_directory='./chroma_db'
)
# 检索
retriever = vectorstore.as_retriever(
search_type='similarity', # mmr
search_kwargs={'k': 5}
)
docs = retriever.get_relevant_documents('查询内容')
FAISS
from langchain.vectorstores import FAISS
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local('faiss_index')
# 加载
vectorstore = FAISS.load_local('faiss_index', embeddings)
RAG 实现
基础 RAG
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type='stuff', # map_reduce, refine
retriever=retriever,
return_source_documents=True
)
result = qa_chain({'query': '问题'})
print(result['result'])
print(result['source_documents'])
自定义 Prompt
from langchain.prompts import PromptTemplate
template = """基于以下上下文回答问题。如果无法从上下文中找到答案,请说"我不知道"。
上下文:
{context}
问题: {question}
回答:"""
prompt = PromptTemplate(
template=template,
input_variables=['context', 'question']
)
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
retriever=retriever,
chain_type_kwargs={'prompt': prompt}
)
对话历史
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
memory_key='chat_history',
return_messages=True
)
qa_chain = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=retriever,
memory=memory
)
result = qa_chain({'question': '问题'})
Agent 开发
ReAct Agent
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.tools import DuckDuckGoSearchRun
# 定义工具
search = DuckDuckGoSearchRun()
tools = [
Tool(
name='Search',
func=search.run,
description='用于搜索互联网上的信息'
),
Tool(
name='Calculator',
func=lambda x: eval(x),
description='用于数学计算'
)
]
# 创建 Agent
agent = initialize_agent(
tools=tools,
llm=llm,
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
verbose=True
)
result = agent.run('北京今天的天气怎么样?')
自定义工具
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
class SearchInput(BaseModel):
query: str = Field(description='搜索查询')
class CustomSearchTool(BaseTool):
name = 'custom_search'
description = '自定义搜索工具'
args_schema = SearchInput
def _run(self, query: str) -> str:
# 实现搜索逻辑
return f'搜索结果: {query}'
async def _arun(self, query: str) -> str:
return self._run(query)
Prompt Engineering
结构化输出
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel
class ExtractedInfo(BaseModel):
name: str
age: int
occupation: str
parser = PydanticOutputParser(pydantic_object=ExtractedInfo)
prompt = PromptTemplate(
template='从以下文本中提取信息:\n{text}\n\n{format_instructions}',
input_variables=['text'],
partial_variables={'format_instructions': parser.get_format_instructions()}
)
chain = prompt | llm | parser
result = chain.invoke({'text': '张三,30岁,软件工程师'})
Few-shot Learning
from langchain.prompts import FewShotPromptTemplate
examples = [
{'input': '今天天气真好', 'output': '正面'},
{'input': '服务太差了', 'output': '负面'},
{'input': '还行吧', 'output': '中性'},
]
example_prompt = PromptTemplate(
input_variables=['input', 'output'],
template='输入: {input}\n输出: {output}'
)
few_shot_prompt = FewShotPromptTemplate(
examples=examples,
example_prompt=example_prompt,
prefix='对以下文本进行情感分类:\n',
suffix='\n输入: {input}\n输出:',
input_variables=['input']
)
vLLM 部署
from vllm import LLM, SamplingParams
# 加载模型
llm = LLM(
model='Qwen/Qwen2-7B-Instruct',
tensor_parallel_size=1,
dtype='float16'
)
# 采样参数
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.9,
max_tokens=512
)
# 批量推理
prompts = ['问题1', '问题2']
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
print(output.outputs[0].text)
API 服务
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI()
class Query(BaseModel):
question: str
@app.post('/chat')
async def chat(query: Query):
result = qa_chain({'query': query.question})
return {'answer': result['result']}
# 运行: uvicorn main:app --host 0.0.0.0 --port 8000