From 87ba009bd744fcd7bb2071548d69a1f889221a49 Mon Sep 17 00:00:00 2001 From: Yaojia Wang Date: Mon, 11 Aug 2025 21:38:25 +0200 Subject: [PATCH] Vector. --- app/agents/vectorization_agent.py | 26 +++++++++++--------------- app/core/vector_store.py | 23 ++--------------------- app/routers/documents.py | 14 ++++++++------ chroma_db/chroma.sqlite3 | Bin 163840 -> 163840 bytes 4 files changed, 21 insertions(+), 42 deletions(-) diff --git a/app/agents/vectorization_agent.py b/app/agents/vectorization_agent.py index 2677a75..08f1ee6 100644 --- a/app/agents/vectorization_agent.py +++ b/app/agents/vectorization_agent.py @@ -1,29 +1,25 @@ # app/agents/vectorization_agent.py from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_openai import OpenAIEmbeddings -embedding_model = OpenAIEmbeddings(model="text-embedding-3-small") -import chromadb - -client = chromadb.PersistentClient(path="./chroma_db") -vector_store = client.get_or_create_collection(name="documents") text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=100, ) -def agent_vectorize_and_store(doc_id: str, text: str, category: str, language: str): - """ - Agent 4: Vectorizes a document and stores it in ChromaDB. - """ +def agent_vectorize_and_store( + doc_id: str, + text: str, + category: str, + language: str, + embedding_model, + vector_store +): print(f"--- [Background Task] Starting vectorization (ID: {doc_id})...") try: - return - chunks = text_splitter.split_text(text) if not chunks: - print(f"--- [Background Task] document {doc_id} has no text to vectorize.") + print(f"--- [Background task] document is empty, skip vectorization. (ID: {doc_id})") return chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))] @@ -38,6 +34,6 @@ def agent_vectorize_and_store(doc_id: str, text: str, category: str, language: s documents=chunks, metadatas=metadatas ) - print(f"--- [Background Task] Document {doc_id} vectorized and stored successfully.") + print(f"--- [Background Task] Document {doc_id} vectorized。") except Exception as e: - print(f"--- [background Task] Vectorization failed (ID: {doc_id}): {e}") + print(f"--- [Background Task] Document vectorization failed (ID: {doc_id}): {e}") diff --git a/app/core/vector_store.py b/app/core/vector_store.py index f0ecd3a..a9f1402 100644 --- a/app/core/vector_store.py +++ b/app/core/vector_store.py @@ -1,47 +1,28 @@ -# app/core/vector_store.py import os import chromadb from dotenv import load_dotenv from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings load_dotenv() - LLM_PROVIDER = os.getenv("LLM_PROVIDER", "openai").lower() - embedding_model = None print(f"--- [Core] Initializing Embeddings with provider: {LLM_PROVIDER} ---") if LLM_PROVIDER == "azure": - required_vars = [ - "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY", - "OPENAI_API_VERSION", "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME" - ] - if not all(os.getenv(var) for var in required_vars): - raise ValueError("One or more Azure OpenAI environment variables for embeddings are not set.") - embedding_model = AzureOpenAIEmbeddings( azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), api_key=os.getenv("AZURE_OPENAI_API_KEY"), api_version=os.getenv("OPENAI_API_VERSION"), azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"), ) - elif LLM_PROVIDER == "openai": - if not os.getenv("OPENAI_API_KEY"): - raise ValueError("OPENAI_API_KEY is not set for the 'openai' provider.") - embedding_model = OpenAIEmbeddings( api_key=os.getenv("OPENAI_API_KEY"), model=os.getenv("OPENAI_EMBEDDING_MODEL_NAME", "text-embedding-3-small") ) - else: - raise ValueError(f"Unsupported LLM_PROVIDER: {LLM_PROVIDER}. Please use 'azure' or 'openai'.") - + raise ValueError(f"Unsupported LLM_PROVIDER: {LLM_PROVIDER}.") client = chromadb.PersistentClient(path="./chroma_db") -vector_store = client.get_or_create_collection( - name="documents", - metadata={"hnsw:space": "cosine"} -) \ No newline at end of file +vector_store = client.get_or_create_collection(name="documents") \ No newline at end of file diff --git a/app/routers/documents.py b/app/routers/documents.py index 221cdb3..208173d 100644 --- a/app/routers/documents.py +++ b/app/routers/documents.py @@ -5,10 +5,10 @@ from typing import Dict, Any, List from fastapi.concurrency import run_in_threadpool from PIL import Image from io import BytesIO - from .. import agents from ..core.pdf_processor import convert_pdf_to_images, image_to_base64_str from ..core.ocr import extract_text_from_images +from ..core.vector_store import embedding_model, vector_store # Create an APIRouter instance router = APIRouter( @@ -102,10 +102,12 @@ async def upload_and_process_document( full_text = await run_in_threadpool(extract_text_from_images, images) background_tasks.add_task( agents.agent_vectorize_and_store, - doc_id, - full_text, - category, - language + doc_id=doc_id, + text=full_text, + category=category, + language=language, + embedding_model=embedding_model, + vector_store=vector_store ) print("--- [Main] Vectorization job added to background tasks.") @@ -118,4 +120,4 @@ async def upload_and_process_document( async def get_result(doc_id: str): if doc_id in db_results: return db_results[doc_id] - raise HTTPException(status_code=404, detail="Document not found.") + raise HTTPException(status_code=404, detail="Document not found.") \ No newline at end of file diff --git a/chroma_db/chroma.sqlite3 b/chroma_db/chroma.sqlite3 index 1794c6354808328d373db81d6094ac1868427ecd..dfcb8493fe1aaf1c9046017aa7e8b47c842a5797 100644 GIT binary patch delta 109 zcmZo@;A&{#njp<6G*QNxQD|er5`8&d{{0O6zxluMf8>9||D693|6TqY{FnL9@t@>B z!oPpBqQP8#4Q5_u22N%k5Wx*1xIhFah~NMb>>z>-NHjmUZ+~vjxc#|3lcWOxt7#hA delta 52 zcmV-40L%Y?fC_+s3XmHC3XvQ`0Sd8Tq%RHy59a_6`w#XH@ek||=d%$&)enQ`KZoZ( K0k`Kr0vSNme-kPI