# app/agents/vectorization_agent.py
from langchain.text_splitter import RecursiveCharacterTextSplitter

from ..core.vector_store import vector_store, embedding_model

# Shared text splitter used to cut long documents into small chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # size of each chunk (number of characters)
    chunk_overlap=50,  # overlap between neighbouring chunks
)


def agent_vectorize_and_store(doc_id: str, text: str, category: str):
    """Agent 4: vectorize a document and store it (real implementation).

    The text is split with the module-level ``text_splitter``; every chunk
    is embedded with ``embedding_model.embed_documents`` and then handed to
    ``vector_store.add`` together with its ID, metadata and raw text.

    Args:
        doc_id: Document identifier. Chunk IDs are derived from it as
            ``"{doc_id}_{index}"`` and it is recorded in each chunk's
            metadata.
        text: Full document text to split and embed.
        category: Category label recorded in each chunk's metadata.

    Returns:
        None. When the splitter yields no chunks, the function returns
        before embedding or storing anything.
    """
    print(f"--- [Agent 4] 正在向量化文档 (ID: {doc_id})...")

    # Step 1: split the document text into chunks.
    segments = text_splitter.split_text(text)
    print(f"--- [Agent 4] 文档被切分为 {len(segments)} 个块。")

    # Nothing to embed or store for an empty split result.
    if not segments:
        print("--- [Agent 4] 文档内容为空,跳过向量化。")
        return

    # Step 2: give every chunk a unique ID and a metadata record.
    segment_ids = []
    segment_meta = []
    for index, _ in enumerate(segments):
        segment_ids.append(f"{doc_id}_{index}")
        segment_meta.append(
            {"doc_id": doc_id, "category": category, "chunk_number": index}
        )

    # Step 3: embed all chunks in a single call to the embedding model.
    vectors = embedding_model.embed_documents(segments)

    # Step 4: store IDs, vectors, metadata and the chunk text itself
    # (the original comments name ChromaDB as the backing store).
    vector_store.add(
        ids=segment_ids,
        embeddings=vectors,
        documents=segments,
        metadatas=segment_meta,
    )
    print(f"--- [Agent 4] 文档 {doc_id} 的向量已存入ChromaDB。")