Prepare vectorization

This commit is contained in:
2025-08-11 16:42:36 +02:00
parent f077c6351d
commit 0c6d008368
5 changed files with 79 additions and 38 deletions

View File

@@ -1,8 +1,6 @@
# app/routers/documents.py
import uuid
import mimetypes
import base64
from fastapi import APIRouter, UploadFile, File, HTTPException
from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks
from typing import Dict, Any, List
from fastapi.concurrency import run_in_threadpool
from PIL import Image
@@ -10,6 +8,7 @@ from io import BytesIO
from .. import agents
from ..core.pdf_processor import convert_pdf_to_images, image_to_base64_str
from ..core.ocr import extract_text_from_images
# Create an APIRouter instance
router = APIRouter(
@@ -46,8 +45,12 @@ async def multimodal_process_pipeline(doc_id: str, image: Image.Image, page_num:
db_results[final_result["doc_id"]] = final_result
return final_result
@router.post("/process", summary="upload and process a document")
async def upload_and_process_document(file: UploadFile = File(...)):
@router.post("/process", summary="Upload and Process Document")
async def upload_and_process_document(
file: UploadFile = File(...),
background_tasks: BackgroundTasks = BackgroundTasks()
):
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided.")
@@ -57,7 +60,7 @@ async def upload_and_process_document(file: UploadFile = File(...)):
try:
file_type = mimetypes.guess_type(file.filename)[0]
print(f"File type: {file_type}")
print(f"Detected file type: {file_type}")
images: List[Image.Image] = []
if file_type == 'application/pdf':
@@ -84,18 +87,28 @@ async def upload_and_process_document(file: UploadFile = File(...)):
elif category == "INVOICE":
extraction_result = await agents.agent_extract_invoice_info(images_base64, language)
else:
print(f"The document is classified as '{category}'skipping extraction。")
print(f"Document classified as '{category}'skipping extraction。")
# 3. Return a unified result
final_result = {
"doc_id": doc_id,
"message": "Document processing initiated. Vectorization is running in the background.",
"page_count": len(images),
"category": category,
"language": language,
"extraction_data": extraction_result.dict() if extraction_result else None,
"status": "Processed"
"status": "Processing"
}
db_results[doc_id] = final_result
full_text = await run_in_threadpool(extract_text_from_images, images)
background_tasks.add_task(
agents.agent_vectorize_and_store,
doc_id,
full_text,
category,
language
)
print("--- [Main] Vectorization job added to background tasks.")
return final_result
except Exception as e: