Prepare vectorization

This commit is contained in:
2025-08-11 16:42:36 +02:00
parent f077c6351d
commit 0c6d008368
5 changed files with 79 additions and 38 deletions

app/agents/__init__.py

@@ -1,4 +1,5 @@
 # app/agents/__init__.py
 from .classification_agent import agent_classify_document_from_image
 from .receipt_agent import agent_extract_receipt_info
-from .invoice_agent import agent_extract_invoice_info
+from .invoice_agent import agent_extract_invoice_info
+from .vectorization_agent import agent_vectorize_and_store

app/agents/vectorization_agent.py

@@ -1,38 +1,43 @@
 # app/agents/vectorization_agent.py
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from ..core.vector_store import vector_store, embedding_model
+from langchain_openai import OpenAIEmbeddings
+embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+import chromadb
+client = chromadb.PersistentClient(path="./chroma_db")
+vector_store = client.get_or_create_collection(name="documents")
 # Initialize the text splitter to divide long documents into smaller chunks
 text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=500,
-    chunk_overlap=50,
+    chunk_size=1000,
+    chunk_overlap=100,
 )
-def agent_vectorize_and_store(doc_id: str, text: str, category: str):
-    """Agent 4: Vectorization and Storage (Real Implementation)"""
-    print(f"--- [Agent 4] Vectorizing document (ID: {doc_id})...")
-    # 1. Split the document text into chunks
-    chunks = text_splitter.split_text(text)
-    print(f"--- [Agent 4] Document split into {len(chunks)} chunks.")
-    if not chunks:
-        print(f"--- [Agent 4] Document is empty, skipping vectorization.")
-        return
-    # 2. Create a unique ID and metadata for each chunk
-    chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
-    metadatas = [{"doc_id": doc_id, "category": category, "chunk_number": i} for i in range(len(chunks))]
-    # 3. Use an embedding model to generate vectors for all chunks
-    embeddings = embedding_model.embed_documents(chunks)
-    # 4. Add the IDs, vectors, metadata, and text chunks to ChromaDB
-    vector_store.add(
-        ids=chunk_ids,
-        embeddings=embeddings,
-        documents=chunks,
-        metadatas=metadatas
-    )
-    print(f"--- [Agent 4] document {doc_id} stored in ChromaDB。")
+def agent_vectorize_and_store(doc_id: str, text: str, category: str, language: str):
+    """
+    Agent 4: Vectorizes a document and stores it in ChromaDB.
+    """
+    print(f"--- [Background Task] Starting vectorization (ID: {doc_id})...")
+    try:
+        chunks = text_splitter.split_text(text)
+        if not chunks:
+            print(f"--- [Background Task] Document {doc_id} has no text to vectorize.")
+            return
+        chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
+        metadatas = [{"doc_id": doc_id, "category": category, "language": language, "chunk_number": i} for i in range(len(chunks))]
+        embeddings = embedding_model.embed_documents(chunks)
+        vector_store.add(
+            ids=chunk_ids,
+            embeddings=embeddings,
+            documents=chunks,
+            metadatas=metadatas
+        )
+        print(f"--- [Background Task] Document {doc_id} vectorized and stored successfully.")
+    except Exception as e:
+        print(f"--- [Background Task] Vectorization failed (ID: {doc_id}): {e}")

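For context, a minimal sketch of how chunks written by agent_vectorize_and_store could be read back from the same collection. The retrieve_similar_chunks helper is hypothetical and simply reuses the persistent path, collection name, and embedding model shown above; it is not part of this commit.

# retrieval_sketch.py - hypothetical helper, not part of this commit
from typing import Optional
import chromadb
from langchain_openai import OpenAIEmbeddings

# Assumes the same persistent path, collection name, and embedding model as the agent above.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
client = chromadb.PersistentClient(path="./chroma_db")
vector_store = client.get_or_create_collection(name="documents")

def retrieve_similar_chunks(query: str, doc_id: Optional[str] = None, n_results: int = 5):
    """Embed the query and return the closest stored chunks, optionally filtered by doc_id."""
    query_embedding = embedding_model.embed_query(query)
    return vector_store.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        where={"doc_id": doc_id} if doc_id else None,
    )

Filtering on the doc_id key is exactly what the per-chunk metadatas written by the agent make possible.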
app/core/ocr.py Normal file

@@ -0,0 +1,23 @@
+import pytesseract
+from PIL import Image
+from typing import List
+def extract_text_from_images(images: List[Image.Image]) -> str:
+    """
+    Extract and merge text from a list of images using Tesseract OCR.
+    """
+    print("--- [Core OCR] Extracting text from images for vectorization...")
+    full_text = []
+    for img in images:
+        try:
+            # lang='chi_sim+eng' recognizes Simplified Chinese and English together
+            text = pytesseract.image_to_string(img, lang='chi_sim+eng')
+            full_text.append(text)
+        except Exception as e:
+            print(f"--- [Core OCR] Failed to process a page: {e}")
+            continue
+    combined_text = "\n\n--- Page Break ---\n\n".join(full_text)
+    print("--- [Core OCR] Text extraction complete.")
+    return combined_text

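Note that extract_text_from_images depends on the Tesseract binary plus the chi_sim and eng language data being installed on the host (for example via the tesseract-ocr-chi-sim package on Debian/Ubuntu). A standalone usage sketch with hypothetical page images:

# Assumes Tesseract and the chi_sim + eng language packs are installed on the host.
from PIL import Image
from app.core.ocr import extract_text_from_images

pages = [Image.open("page1.png"), Image.open("page2.png")]  # hypothetical sample pages
text = extract_text_from_images(pages)
print(text[:200])  # preview the combined OCR output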
app/routers/documents.py

@@ -1,8 +1,6 @@
 # app/routers/documents.py
 import uuid
 import mimetypes
-import base64
-from fastapi import APIRouter, UploadFile, File, HTTPException
+from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks
 from typing import Dict, Any, List
 from fastapi.concurrency import run_in_threadpool
 from PIL import Image
@@ -10,6 +8,7 @@ from io import BytesIO
 from .. import agents
 from ..core.pdf_processor import convert_pdf_to_images, image_to_base64_str
+from ..core.ocr import extract_text_from_images
 # Create an APIRouter instance
 router = APIRouter(
@@ -46,8 +45,12 @@ async def multimodal_process_pipeline(doc_id: str, image: Image.Image, page_num:
     db_results[final_result["doc_id"]] = final_result
     return final_result
-@router.post("/process", summary="upload and process a document")
-async def upload_and_process_document(file: UploadFile = File(...)):
+@router.post("/process", summary="Upload and Process Document")
+async def upload_and_process_document(
+    file: UploadFile = File(...),
+    background_tasks: BackgroundTasks = BackgroundTasks()
+):
     if not file.filename:
         raise HTTPException(status_code=400, detail="No file provided.")
@@ -57,7 +60,7 @@ async def upload_and_process_document(file: UploadFile = File(...)):
     try:
         file_type = mimetypes.guess_type(file.filename)[0]
-        print(f"File type: {file_type}")
+        print(f"Detected file type: {file_type}")
         images: List[Image.Image] = []
         if file_type == 'application/pdf':
@@ -84,18 +87,28 @@ async def upload_and_process_document(file: UploadFile = File(...)):
elif category == "INVOICE":
extraction_result = await agents.agent_extract_invoice_info(images_base64, language)
else:
print(f"The document is classified as '{category}'skipping extraction。")
print(f"Document classified as '{category}'skipping extraction。")
# 3. Return a unified result
final_result = {
"doc_id": doc_id,
"message": "Document processing initiated. Vectorization is running in the background.",
"page_count": len(images),
"category": category,
"language": language,
"extraction_data": extraction_result.dict() if extraction_result else None,
"status": "Processed"
"status": "Processing"
}
db_results[doc_id] = final_result
full_text = await run_in_threadpool(extract_text_from_images, images)
background_tasks.add_task(
agents.agent_vectorize_and_store,
doc_id,
full_text,
category,
language
)
print("--- [Main] Vectorization job added to background tasks.")
return final_result
except Exception as e:

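With these changes the endpoint runs OCR in a threadpool before responding and returns as soon as classification and extraction complete, while vectorization runs afterwards as a BackgroundTasks job. A rough client-side sketch; the /documents route prefix is an assumption, since the APIRouter(...) call is truncated in this hunk:

# Hypothetical client-side check of the new endpoint; adjust the URL to the project's actual prefix.
import requests

with open("sample_invoice.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/documents/process",  # assumed "/documents" prefix
        files={"file": ("sample_invoice.pdf", f, "application/pdf")},
    )
print(resp.status_code, resp.json().get("status"))  # expected: 200 Processing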
BIN
chroma_db/chroma.sqlite3 Normal file

Binary file not shown.