diff --git a/app/agents/__init__.py b/app/agents/__init__.py index 49b8c66..9c99e07 100644 --- a/app/agents/__init__.py +++ b/app/agents/__init__.py @@ -1,4 +1,4 @@ -# app/agents/__init__.py from .classification_agent import agent_classify_document_from_image from .receipt_agent import agent_extract_receipt_info -from .invoice_agent import agent_extract_invoice_info \ No newline at end of file +from .invoice_agent import agent_extract_invoice_info +from .vectorization_agent import agent_vectorize_and_store \ No newline at end of file diff --git a/app/agents/vectorization_agent.py b/app/agents/vectorization_agent.py index 983e568..2677a75 100644 --- a/app/agents/vectorization_agent.py +++ b/app/agents/vectorization_agent.py @@ -1,38 +1,43 @@ # app/agents/vectorization_agent.py from langchain.text_splitter import RecursiveCharacterTextSplitter -from ..core.vector_store import vector_store, embedding_model +from langchain_openai import OpenAIEmbeddings +embedding_model = OpenAIEmbeddings(model="text-embedding-3-small") +import chromadb + +client = chromadb.PersistentClient(path="./chroma_db") +vector_store = client.get_or_create_collection(name="documents") -# Initialize the text splitter to divide long documents into smaller chunks text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=50, + chunk_size=1000, + chunk_overlap=100, ) -def agent_vectorize_and_store(doc_id: str, text: str, category: str): - """Agent 4: Vectorization and Storage (Real Implementation)""" - print(f"--- [Agent 4] Vectorizing document (ID: {doc_id})...") +def agent_vectorize_and_store(doc_id: str, text: str, category: str, language: str): + """ + Agent 4: Vectorizes a document and stores it in ChromaDB. + """ + print(f"--- [Background Task] Starting vectorization (ID: {doc_id})...") - # 1. Split the document text into chunks - chunks = text_splitter.split_text(text) - print(f"--- [Agent 4] Document split into {len(chunks)} chunks.") - - if not chunks: - print(f"--- [Agent 4] Document is empty, skipping vectorization.") + try: return - # 2. Create a unique ID and metadata for each chunk - chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))] - metadatas = [{"doc_id": doc_id, "category": category, "chunk_number": i} for i in range(len(chunks))] + chunks = text_splitter.split_text(text) + if not chunks: + print(f"--- [Background Task] document {doc_id} has no text to vectorize.") + return - # 3. Use an embedding model to generate vectors for all chunks - embeddings = embedding_model.embed_documents(chunks) + chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))] + metadatas = [{"doc_id": doc_id, "category": category, "language": language, "chunk_number": i} for i in + range(len(chunks))] - # 4. Add the IDs, vectors, metadata, and text chunks to ChromaDB - vector_store.add( - ids=chunk_ids, - embeddings=embeddings, - documents=chunks, - metadatas=metadatas - ) + embeddings = embedding_model.embed_documents(chunks) - print(f"--- [Agent 4] document {doc_id} stored in ChromaDB。") + vector_store.add( + ids=chunk_ids, + embeddings=embeddings, + documents=chunks, + metadatas=metadatas + ) + print(f"--- [Background Task] Document {doc_id} vectorized and stored successfully.") + except Exception as e: + print(f"--- [background Task] Vectorization failed (ID: {doc_id}): {e}") diff --git a/app/core/ocr.py b/app/core/ocr.py new file mode 100644 index 0000000..1e42327 --- /dev/null +++ b/app/core/ocr.py @@ -0,0 +1,23 @@ +import pytesseract +from PIL import Image +from typing import List + + +def extract_text_from_images(images: List[Image.Image]) -> str: + """ + 使用Tesseract OCR从一系列图片中提取并合并所有文本。 + """ + print("--- [Core OCR] 正在从图片中提取文本用于向量化...") + full_text = [] + for img in images: + try: + # lang='chi_sim+eng' 表示同时识别简体中文和英文 + text = pytesseract.image_to_string(img, lang='chi_sim+eng') + full_text.append(text) + except Exception as e: + print(f"--- [Core OCR] 单页处理失败: {e}") + continue + + combined_text = "\n\n--- Page Break ---\n\n".join(full_text) + print("--- [Core OCR] 文本提取成功。") + return combined_text diff --git a/app/routers/documents.py b/app/routers/documents.py index 8d3e9ec..221cdb3 100644 --- a/app/routers/documents.py +++ b/app/routers/documents.py @@ -1,8 +1,6 @@ -# app/routers/documents.py import uuid import mimetypes -import base64 -from fastapi import APIRouter, UploadFile, File, HTTPException +from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks from typing import Dict, Any, List from fastapi.concurrency import run_in_threadpool from PIL import Image @@ -10,6 +8,7 @@ from io import BytesIO from .. import agents from ..core.pdf_processor import convert_pdf_to_images, image_to_base64_str +from ..core.ocr import extract_text_from_images # Create an APIRouter instance router = APIRouter( @@ -46,8 +45,12 @@ async def multimodal_process_pipeline(doc_id: str, image: Image.Image, page_num: db_results[final_result["doc_id"]] = final_result return final_result -@router.post("/process", summary="upload and process a document") -async def upload_and_process_document(file: UploadFile = File(...)): + +@router.post("/process", summary="Upload and Process Document") +async def upload_and_process_document( + file: UploadFile = File(...), + background_tasks: BackgroundTasks = BackgroundTasks() +): if not file.filename: raise HTTPException(status_code=400, detail="No file provided.") @@ -57,7 +60,7 @@ async def upload_and_process_document(file: UploadFile = File(...)): try: file_type = mimetypes.guess_type(file.filename)[0] - print(f"File type: {file_type}") + print(f"Detected file type: {file_type}") images: List[Image.Image] = [] if file_type == 'application/pdf': @@ -84,18 +87,28 @@ async def upload_and_process_document(file: UploadFile = File(...)): elif category == "INVOICE": extraction_result = await agents.agent_extract_invoice_info(images_base64, language) else: - print(f"The document is classified as '{category}',skipping extraction。") + print(f"Document classified as '{category}',skipping extraction。") - # 3. Return a unified result final_result = { "doc_id": doc_id, + "message": "Document processing initiated. Vectorization is running in the background.", "page_count": len(images), "category": category, "language": language, "extraction_data": extraction_result.dict() if extraction_result else None, - "status": "Processed" + "status": "Processing" } - db_results[doc_id] = final_result + + full_text = await run_in_threadpool(extract_text_from_images, images) + background_tasks.add_task( + agents.agent_vectorize_and_store, + doc_id, + full_text, + category, + language + ) + print("--- [Main] Vectorization job added to background tasks.") + return final_result except Exception as e: diff --git a/chroma_db/chroma.sqlite3 b/chroma_db/chroma.sqlite3 new file mode 100644 index 0000000..e43d25f Binary files /dev/null and b/chroma_db/chroma.sqlite3 differ