Prepare vectorization

2025-08-11 16:42:36 +02:00
parent f077c6351d
commit 0c6d008368
5 changed files with 79 additions and 38 deletions

app/agents/__init__.py

@@ -1,4 +1,4 @@
-# app/agents/__init__.py
 from .classification_agent import agent_classify_document_from_image
 from .receipt_agent import agent_extract_receipt_info
 from .invoice_agent import agent_extract_invoice_info
+from .vectorization_agent import agent_vectorize_and_store

app/agents/vectorization_agent.py

@@ -1,38 +1,43 @@
 # app/agents/vectorization_agent.py
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from ..core.vector_store import vector_store, embedding_model
+from langchain_openai import OpenAIEmbeddings
+
+embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+
+import chromadb
+client = chromadb.PersistentClient(path="./chroma_db")
+vector_store = client.get_or_create_collection(name="documents")
 
+# Initialize the text splitter to divide long documents into smaller chunks
 text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=500,
-    chunk_overlap=50,
+    chunk_size=1000,
+    chunk_overlap=100,
 )
 
-def agent_vectorize_and_store(doc_id: str, text: str, category: str):
-    """Agent 4: Vectorization and Storage (Real Implementation)"""
-    print(f"--- [Agent 4] Vectorizing document (ID: {doc_id})...")
-
-    # 1. Split the document text into chunks
-    chunks = text_splitter.split_text(text)
-    print(f"--- [Agent 4] Document split into {len(chunks)} chunks.")
-    if not chunks:
-        print(f"--- [Agent 4] Document is empty, skipping vectorization.")
-        return
-
-    # 2. Create a unique ID and metadata for each chunk
-    chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
-    metadatas = [{"doc_id": doc_id, "category": category, "chunk_number": i} for i in range(len(chunks))]
-
-    # 3. Use an embedding model to generate vectors for all chunks
-    embeddings = embedding_model.embed_documents(chunks)
-
-    # 4. Add the IDs, vectors, metadata, and text chunks to ChromaDB
-    vector_store.add(
-        ids=chunk_ids,
-        embeddings=embeddings,
-        documents=chunks,
-        metadatas=metadatas
-    )
-    print(f"--- [Agent 4] document {doc_id} stored in ChromaDB.")
+def agent_vectorize_and_store(doc_id: str, text: str, category: str, language: str):
+    """
+    Agent 4: Vectorizes a document and stores it in ChromaDB.
+    """
+    print(f"--- [Background Task] Starting vectorization (ID: {doc_id})...")
+    try:
+        chunks = text_splitter.split_text(text)
+        if not chunks:
+            print(f"--- [Background Task] Document {doc_id} has no text to vectorize.")
+            return
+
+        chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
+        metadatas = [{"doc_id": doc_id, "category": category, "language": language, "chunk_number": i}
+                     for i in range(len(chunks))]
+
+        embeddings = embedding_model.embed_documents(chunks)
+
+        vector_store.add(
+            ids=chunk_ids,
+            embeddings=embeddings,
+            documents=chunks,
+            metadatas=metadatas
+        )
+        print(f"--- [Background Task] Document {doc_id} vectorized and stored successfully.")
+    except Exception as e:
+        print(f"--- [Background Task] Vectorization failed (ID: {doc_id}): {e}")

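For orientation, the collection written by agent_vectorize_and_store can be queried back with the same embedding model. The following is a minimal sketch, not part of this commit: the query string and the "INVOICE" filter are hypothetical examples, and it assumes the ./chroma_db path and "documents" collection name used above (plus an OPENAI_API_KEY in the environment):

# retrieval_sketch.py - minimal sketch, not part of this commit.
# Assumes the ./chroma_db path and "documents" collection created above;
# the query string and the "INVOICE" filter are hypothetical examples.
import chromadb
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="documents")

# Embed the query with the same model used at ingestion, then search the
# collection, filtering on the metadata stored by agent_vectorize_and_store.
query_vector = embedding_model.embed_query("total amount due on the invoice")
results = collection.query(
    query_embeddings=[query_vector],
    n_results=3,
    where={"category": "INVOICE"},
)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["doc_id"], meta["chunk_number"], doc[:80])

Because the chunk IDs embed the parent doc_id and the metadata carries doc_id, category, language, and chunk_number, results can always be traced back to the originating document and page order.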
app/core/ocr.py Normal file

@@ -0,0 +1,23 @@
import pytesseract
from PIL import Image
from typing import List

def extract_text_from_images(images: List[Image.Image]) -> str:
    """
    Extract and merge all text from a list of images using Tesseract OCR.
    """
    print("--- [Core OCR] Extracting text from images for vectorization...")
    full_text = []
    for img in images:
        try:
            # lang='chi_sim+eng' recognizes Simplified Chinese and English together
            text = pytesseract.image_to_string(img, lang='chi_sim+eng')
            full_text.append(text)
        except Exception as e:
            print(f"--- [Core OCR] Failed to process a page: {e}")
            continue
    combined_text = "\n\n--- Page Break ---\n\n".join(full_text)
    print("--- [Core OCR] Text extraction complete.")
    return combined_text

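A quick usage sketch for the new helper (not part of the commit): it accepts any list of PIL images, so pages rendered from a PDF or loaded from disk both work. The file names below are placeholders, and Tesseract must be installed with the chi_sim and eng language packs for the lang='chi_sim+eng' call to succeed:

# ocr_usage_sketch.py - minimal sketch, not part of this commit.
# "page1.png" and "page2.png" are placeholder inputs.
from PIL import Image
from app.core.ocr import extract_text_from_images

pages = [Image.open("page1.png"), Image.open("page2.png")]
text = extract_text_from_images(pages)

# Pages are joined with an explicit separator, so they can be split back out.
first_page = text.split("\n\n--- Page Break ---\n\n")[0]
print(first_page)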
app/routers/documents.py

@@ -1,8 +1,6 @@
-# app/routers/documents.py
 import uuid
 import mimetypes
-import base64
-from fastapi import APIRouter, UploadFile, File, HTTPException
+from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks
 from typing import Dict, Any, List
 from fastapi.concurrency import run_in_threadpool
 from PIL import Image
@@ -10,6 +8,7 @@ from io import BytesIO
 from .. import agents
 from ..core.pdf_processor import convert_pdf_to_images, image_to_base64_str
+from ..core.ocr import extract_text_from_images
 
 # Create an APIRouter instance
 router = APIRouter(
@@ -46,8 +45,12 @@ async def multimodal_process_pipeline(doc_id: str, image: Image.Image, page_num:
     db_results[final_result["doc_id"]] = final_result
     return final_result
 
-@router.post("/process", summary="upload and process a document")
-async def upload_and_process_document(file: UploadFile = File(...)):
+@router.post("/process", summary="Upload and Process Document")
+async def upload_and_process_document(
+    file: UploadFile = File(...),
+    background_tasks: BackgroundTasks = BackgroundTasks()
+):
     if not file.filename:
         raise HTTPException(status_code=400, detail="No file provided.")
@@ -57,7 +60,7 @@ async def upload_and_process_document(file: UploadFile = File(...)):
     try:
         file_type = mimetypes.guess_type(file.filename)[0]
-        print(f"File type: {file_type}")
+        print(f"Detected file type: {file_type}")
 
         images: List[Image.Image] = []
         if file_type == 'application/pdf':
@@ -84,18 +87,28 @@ async def upload_and_process_document(file: UploadFile = File(...)):
         elif category == "INVOICE":
             extraction_result = await agents.agent_extract_invoice_info(images_base64, language)
         else:
-            print(f"The document is classified as '{category}', skipping extraction.")
+            print(f"Document classified as '{category}', skipping extraction.")
 
-        # 3. Return a unified result
         final_result = {
             "doc_id": doc_id,
+            "message": "Document processing initiated. Vectorization is running in the background.",
             "page_count": len(images),
             "category": category,
             "language": language,
             "extraction_data": extraction_result.dict() if extraction_result else None,
-            "status": "Processed"
+            "status": "Processing"
         }
         db_results[doc_id] = final_result
+
+        full_text = await run_in_threadpool(extract_text_from_images, images)
+        background_tasks.add_task(
+            agents.agent_vectorize_and_store,
+            doc_id,
+            full_text,
+            category,
+            language
+        )
+        print("--- [Main] Vectorization job added to background tasks.")
+
         return final_result
     except Exception as e:

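The key mechanism in the router change is FastAPI's BackgroundTasks: a task registered with add_task executes only after the HTTP response has been sent, which is why the endpoint can run OCR up front (in a threadpool, to avoid blocking the event loop) and defer only the embedding and storage work while returning "status": "Processing" immediately. A self-contained sketch of the pattern, where slow_job and the /demo route are hypothetical stand-ins for the real pipeline:

# background_sketch.py - minimal sketch of the pattern, not part of this commit.
import time
from fastapi import BackgroundTasks, FastAPI

app = FastAPI()

def slow_job(doc_id: str) -> None:
    time.sleep(2)  # stands in for the embedding + storage work
    print(f"--- finished background work for {doc_id}")

@app.post("/demo/{doc_id}")
async def demo(doc_id: str, background_tasks: BackgroundTasks):
    # The task is queued now but runs only after the response is sent,
    # so the client gets an immediate reply.
    background_tasks.add_task(slow_job, doc_id)
    return {"doc_id": doc_id, "status": "Processing"}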
chroma_db/chroma.sqlite3 Normal file (binary file not shown)