From 0c6d00836806d7aad2daf81aa1fa8777c9275e23 Mon Sep 17 00:00:00 2001 From: Yaojia Wang Date: Mon, 11 Aug 2025 16:42:36 +0200 Subject: [PATCH] Prepare vectorizes --- app/agents/__init__.py | 4 +-- app/agents/vectorization_agent.py | 57 ++++++++++++++++-------------- app/core/ocr.py | 23 ++++++++++++ app/routers/documents.py | 33 +++++++++++------ chroma_db/chroma.sqlite3 | Bin 0 -> 163840 bytes 5 files changed, 79 insertions(+), 38 deletions(-) create mode 100644 app/core/ocr.py create mode 100644 chroma_db/chroma.sqlite3 diff --git a/app/agents/__init__.py b/app/agents/__init__.py index 49b8c66..9c99e07 100644 --- a/app/agents/__init__.py +++ b/app/agents/__init__.py @@ -1,4 +1,4 @@ -# app/agents/__init__.py from .classification_agent import agent_classify_document_from_image from .receipt_agent import agent_extract_receipt_info -from .invoice_agent import agent_extract_invoice_info \ No newline at end of file +from .invoice_agent import agent_extract_invoice_info +from .vectorization_agent import agent_vectorize_and_store \ No newline at end of file diff --git a/app/agents/vectorization_agent.py b/app/agents/vectorization_agent.py index 983e568..2677a75 100644 --- a/app/agents/vectorization_agent.py +++ b/app/agents/vectorization_agent.py @@ -1,38 +1,43 @@ # app/agents/vectorization_agent.py from langchain.text_splitter import RecursiveCharacterTextSplitter -from ..core.vector_store import vector_store, embedding_model +from langchain_openai import OpenAIEmbeddings +embedding_model = OpenAIEmbeddings(model="text-embedding-3-small") +import chromadb + +client = chromadb.PersistentClient(path="./chroma_db") +vector_store = client.get_or_create_collection(name="documents") -# Initialize the text splitter to divide long documents into smaller chunks text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=50, + chunk_size=1000, + chunk_overlap=100, ) -def agent_vectorize_and_store(doc_id: str, text: str, category: str): - """Agent 4: Vectorization and Storage (Real Implementation)""" - print(f"--- [Agent 4] Vectorizing document (ID: {doc_id})...") +def agent_vectorize_and_store(doc_id: str, text: str, category: str, language: str): + """ + Agent 4: Vectorizes a document and stores it in ChromaDB. + """ + print(f"--- [Background Task] Starting vectorization (ID: {doc_id})...") - # 1. Split the document text into chunks - chunks = text_splitter.split_text(text) - print(f"--- [Agent 4] Document split into {len(chunks)} chunks.") - - if not chunks: - print(f"--- [Agent 4] Document is empty, skipping vectorization.") + try: return - # 2. Create a unique ID and metadata for each chunk - chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))] - metadatas = [{"doc_id": doc_id, "category": category, "chunk_number": i} for i in range(len(chunks))] + chunks = text_splitter.split_text(text) + if not chunks: + print(f"--- [Background Task] document {doc_id} has no text to vectorize.") + return - # 3. Use an embedding model to generate vectors for all chunks - embeddings = embedding_model.embed_documents(chunks) + chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))] + metadatas = [{"doc_id": doc_id, "category": category, "language": language, "chunk_number": i} for i in + range(len(chunks))] - # 4. Add the IDs, vectors, metadata, and text chunks to ChromaDB - vector_store.add( - ids=chunk_ids, - embeddings=embeddings, - documents=chunks, - metadatas=metadatas - ) + embeddings = embedding_model.embed_documents(chunks) - print(f"--- [Agent 4] document {doc_id} stored in ChromaDB。") + vector_store.add( + ids=chunk_ids, + embeddings=embeddings, + documents=chunks, + metadatas=metadatas + ) + print(f"--- [Background Task] Document {doc_id} vectorized and stored successfully.") + except Exception as e: + print(f"--- [background Task] Vectorization failed (ID: {doc_id}): {e}") diff --git a/app/core/ocr.py b/app/core/ocr.py new file mode 100644 index 0000000..1e42327 --- /dev/null +++ b/app/core/ocr.py @@ -0,0 +1,23 @@ +import pytesseract +from PIL import Image +from typing import List + + +def extract_text_from_images(images: List[Image.Image]) -> str: + """ + 使用Tesseract OCR从一系列图片中提取并合并所有文本。 + """ + print("--- [Core OCR] 正在从图片中提取文本用于向量化...") + full_text = [] + for img in images: + try: + # lang='chi_sim+eng' 表示同时识别简体中文和英文 + text = pytesseract.image_to_string(img, lang='chi_sim+eng') + full_text.append(text) + except Exception as e: + print(f"--- [Core OCR] 单页处理失败: {e}") + continue + + combined_text = "\n\n--- Page Break ---\n\n".join(full_text) + print("--- [Core OCR] 文本提取成功。") + return combined_text diff --git a/app/routers/documents.py b/app/routers/documents.py index 8d3e9ec..221cdb3 100644 --- a/app/routers/documents.py +++ b/app/routers/documents.py @@ -1,8 +1,6 @@ -# app/routers/documents.py import uuid import mimetypes -import base64 -from fastapi import APIRouter, UploadFile, File, HTTPException +from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks from typing import Dict, Any, List from fastapi.concurrency import run_in_threadpool from PIL import Image @@ -10,6 +8,7 @@ from io import BytesIO from .. import agents from ..core.pdf_processor import convert_pdf_to_images, image_to_base64_str +from ..core.ocr import extract_text_from_images # Create an APIRouter instance router = APIRouter( @@ -46,8 +45,12 @@ async def multimodal_process_pipeline(doc_id: str, image: Image.Image, page_num: db_results[final_result["doc_id"]] = final_result return final_result -@router.post("/process", summary="upload and process a document") -async def upload_and_process_document(file: UploadFile = File(...)): + +@router.post("/process", summary="Upload and Process Document") +async def upload_and_process_document( + file: UploadFile = File(...), + background_tasks: BackgroundTasks = BackgroundTasks() +): if not file.filename: raise HTTPException(status_code=400, detail="No file provided.") @@ -57,7 +60,7 @@ async def upload_and_process_document(file: UploadFile = File(...)): try: file_type = mimetypes.guess_type(file.filename)[0] - print(f"File type: {file_type}") + print(f"Detected file type: {file_type}") images: List[Image.Image] = [] if file_type == 'application/pdf': @@ -84,18 +87,28 @@ async def upload_and_process_document(file: UploadFile = File(...)): elif category == "INVOICE": extraction_result = await agents.agent_extract_invoice_info(images_base64, language) else: - print(f"The document is classified as '{category}',skipping extraction。") + print(f"Document classified as '{category}',skipping extraction。") - # 3. Return a unified result final_result = { "doc_id": doc_id, + "message": "Document processing initiated. Vectorization is running in the background.", "page_count": len(images), "category": category, "language": language, "extraction_data": extraction_result.dict() if extraction_result else None, - "status": "Processed" + "status": "Processing" } - db_results[doc_id] = final_result + + full_text = await run_in_threadpool(extract_text_from_images, images) + background_tasks.add_task( + agents.agent_vectorize_and_store, + doc_id, + full_text, + category, + language + ) + print("--- [Main] Vectorization job added to background tasks.") + return final_result except Exception as e: diff --git a/chroma_db/chroma.sqlite3 b/chroma_db/chroma.sqlite3 new file mode 100644 index 0000000000000000000000000000000000000000..e43d25f94f799989ad26a0c60e55888e100cb429 GIT binary patch literal 163840 zcmeI5U2Gdip5WQ~AT7!^op=()aXc<7lSmw^Wp?vBnjovlv`2_OVu^~A-8m4=?xMss z*_7E$+nQ`1C^^~9+}_JAu)tw~eK_D=Z-E^gkePW|;D7_}WpKEc+m{{ej^UI1RAGY^T*kpHAb@jh~|EhoetGcVSb7#F_Q&MktTAEEVkqZ$?iu?f~ zkw_#4{}~?oa(_^46p@@%cnW`i=BK1Z6|;}dMV-^$rE9Td zsU-C_Y^~O$v{j>q(J=R_^=`9i(?@o-VLTch`r7Jtd8JY&m6dnb%Vd6dqWJ}j?lp`> zV$)7*ksR!q?M3qb_S(kE_DAHs@<(EHDn;(DRerp+TOr$9ch_$Arb#Thye##ujgrmK zY;DMWS@7j2O2<;oFEk9Ywpl6PE^musEs|QZt=}W>u5Z1YvaiOHigKp+(g-mIC2;Tk z!P<6ZcV*orG?Dp*zP z+HL2&+R`3XEqWicQ&ocFS07x7C9hqRdId++AR;)r6yg$ulZ7~e4`{1JO{R5HDSucA zF>w>Hn>t{~=2nGl?yj%LQ>(8~u>u-aRTWNN%3VC^Wr}dw_m0#TW69TFmmXhtC6(rET}t+v?V8qX9y%7r)LQ+OLr4l(2s4m1&7w|n0}z@Ps~K)L z#an}yW}pocq3(Ez9Q#JLHlc~Fm!ZdAUD;V(xmjK$dfTiw_NsquwaqG+IKi@ct&K^$ z=u5<#nesw3S-v_-MgC2zs(<*Ra^Yl`DE=ic9OV+RhPCi|L2+Gg_* zzNoJc@~~m=Lz=zD18O=M#8dM5XmaK9DDAkSRTcLj@so40KJGk}&qR|SWk&H0ER_$mDwsb% zaOOyk#gdtf^jLBz54mng(3ueHAqiaELX1aTuu(zm)5$!L51RZU;n;g`PR5e+^U~3* z!%NfecN-mAeb@mfJ#;b^VibyU=`g3;`v`8Tpm$!Ayfo~W4NbsaYj)T2dIi68b=shp zIF*?g=UlcLd%P0{pC~w3 zoc?m^;^f_lM5HN>xxb#{V=@ydrh2<$Smfb8bmX2d1A6kE_P}Gdhz8yPIErk#i5g3B(CUg!n6FLRb>a-Nz+Z$Z z7p7{b2@^PQoQJMO4Pv)R$GIJOpy}OiOUSxU<-+~hD8f-XH&S1Nrw<&pG%e6lMo|nkc||Sgy%Kaw(w&~hImQ^rf>_15{~9K_W4Aj{xrI8-Q+3_(hS6{a zT3V+;t)-CSgG(&3a`Prx-CEz>*c|F4FUJ+V0Lrh`v`jXyWmQcn*791dP*?O)u>_N7 z$27{C>1S_n*`rMMyePZZVm;@luM~Vn(P%?apdk~hdvMV1fIsNW&Jlf|!t50H{$QG} z!_~Y<^$^15pxd`oAjYuM>NxbvK}156ckh)?qrfS z7`!F2W;-)}7v+3U0$)07aIqnP9OyV?q;Mi@MOJHg!(GCup4V{nlobHj;K9)PL zmnNl9=^+TlINayu_vOOcbPsye!%Ww zSN*vF&?Ts3htGGoE!m2Xtg?<_bL*^gUdq?iLXFl9 zT`{t%q5kah6l;io;a27;R>-XE4hUq=TvMLn!aUUQNfd8{ zUZ5F8uVcJXJXUky8tY`B>lNq=s{6uks9=z5qyI zvsRtCNMg{s2FB7GWwQq77s2%{wzgq_`2PCJYMHwk{y3P;$9=G}z6(>y3-fM9;54w| z`h04cF=Q67?+(y?LTB8coV39vc6iUeySsMN@*JD~n|s1R^I^jgUOs+LHaQc>!6;`3tIj)wJj1C@SE}cokP@_*vqU#6dz&e31C3#Q#YACh=br z|0(gy#J^Ac+r(dfJu?893lcyANB{{S0VIF~kN^@u0!RP}Ac3cxz{^vzB>69pT2**E zu1hB_yfh`x3J`uOW%k6jY$~3PUayF|| zxl}9|GHkqEqB>P{t(NXd()4cQ(ryX%Pt*R`e$JF8m_3;s^k2e5zq*7iwc!1w>3$106wLIOwt2_OL^ zfCP{L5m5s#T|KCRvfB&?V5amGvNB{{S0VIF~kN^@u0!RP}AOR$R1fE1-esXul zn*r#X7;sL#IR5_sKSUD$@T43u9tj`;B!C2v01`j~NB{{S0VIF~kN^^RDhbR@rR2W% z{k^CE-rE4q`~T;OI>$wtF?Qdz>;1 zT2hsIQKos-kh58x%B5n#kn@>ZiRx6*wOZO}>)jSLZOeNqKJJP6$8!&;ZnrzthH218 zb8me-w{Kbx*{^k=>2&U`xu!a|I7jPMy=_`{r>onIwmJ8fl1@9J7S%fXzI)!%9#u`+ z*xRqQI~IiH7w0l(WW1TCqHbJn4$SM7a>+HW@v=i2lqpFno48a0fDxmT@s zP4_+@&)J6uOpJSG`=JSvm|BaSx+8xG0J-XoJ&^d5Pd?$#|MC6*XRU&xyhs2EAOR$R z1dsp{Kmter2_OL^fCLx;_Wu7#rat)i|JRYkuaO%PKmter2_OL^fCP{L5Y&P&A`V06N$fp{r}(#KS%%xAOR$R1dsp{Kmter2_OL^fCP}h(?DQh za#yO=bGc$BpOaNRTadGLO_gf}s>`LcLTl*)EC?tSCfM`;Qi0~PnQTth3-H$gvc-@-Oe7Jd&};0>Ec|~K0JVvxpJ!pTLCuAJ?l=FcIj%{tT*<$9k{4%u9-INm|Ao0 z6MX*v-6#_8iUg1V5Yq4ahB=t6It=6Q} zdljKt?>3t@ePmY~#-rh(udQyES1M&vS$TK8Oy*f4@eaZ9*Lx4=O^J|r^9vT;YZ!|J zUX^Gql7l_7y-42QUfWpN{)oI+{z!~YrO4g2%8$2pD`b1??%K`XG>Ijbm!;mdQRO!@ zTN?s^1z#Q{&?g;BHNVg>$l7M5e7n3YinT~;&9;7zyt}^jZpywIODf8l-b*9I7?i-h z_Xlg+mEDzfR||v5$nMVC=5129tsKa)NG!VpZ<$MC5y8==5SJjFEW`rDt7cprzv%`apYE z4PKKBm1Z#BEtR0;?eeYic6oEPyhHrt7SK;EI>oz34|&18vhiS1Gfq4YZU8AqTV5UF z!S!Za^CF-fuJcf#))PnFaIaK6m3eiXo&y;LWY+A_#Y>K&kMD<3SwZ^va5UB+5T#=5-4n{BB z`;CPV&UwLe0m9*JguOd9v=-^@;SqYf*`zvqr;J}FRMH@dRZ^c;JoWa=(d3=$qYB&7 zpl(uAGj&>Rw)dEk2n9rYFTE5?UcWB&pfy70(ih!#IvoPh7wOR8Ev(;M*o_voY^`;` z+J`SY(X#n$xFG5C%sG&F8kh*`eH2k`s>o;>kb7W zN;te2D*c=gGp;5OStebxalcE+Ub9_euNXQO#?)HW7ONR< zH^p0nmu8?15~1#Ri5&Yzwl<-Ot(UeB^=6bsQYt1MA!`+ z!#7ki$0EOTHk#a#M+xH^)T-CN`_7PlXfb|NznV!@^AhmY8In0$1`sQ%g$DN1rnP~E(%qYHrrSgGR1@q?z z&K$|HSTd859!n18A=eEFIul|&B!P=ti1COEHY$jHI++LZL6cu39DDE0$yjoJUOJj} zcxn3mZlgo14?EzbhfbzKj6zW^9p-dizLxqO^*N*BZgM-!oP1>r*qWi)ud;!8MKo(Hre_HbiT z)*DSQhXWmGsL-tNFkT8f7K1khgS!8uG=dh$0u*GCxT2;)cTAm&{UnlfHZ3_v^D?oIQ?i&;Iz#EAhXIeLVdirl+RrlmBt@2Q$B$dFRaM((h*f`{WOT zaX215Qv{CEOHQ}+_@dK9{2`H51%Iy_Ox3vO*Fw#uKegX2IV0;~vk^{h`@Kd_fxqqL zA0D*X?SqCstT}ilJDmIhj8@tmXN<>N7B%%YTP7p8kY2Cu1jSRm*M1mFZfr@AIx}2-^E;vJ<`?dT1}Ee}|6~(M#gbcFQtw7sqXXg@ZQW}8 zG*I!V1oI~q(9Ll4RFw5oJ`dHqyud|#;|U@HL2%aWu?Sg$H%}^}o8f2?^>nJYG#^Xe zzAg3s=n2vdpR{zf1(9!@R4PdOdrdFC`erP-wsr!I!+2~oOE&QM!AV7CiAPfu<{Xb= z?pzoj{RFr7TGe34Gcp%;GFLb7wc`IJay9wRJ5ujTs9r^HxRd0sz7cbxS3}Wz{mnAq z<V+?hiP1JtXlJ_wmTi`I#T|eZ7}Wn zHBHC5Mm^Qb$g$*miqw1POcMu%_NomV=KFKZgVDr=v9@`${Gl+tHuQ2fqc9|k1@Mvk z=CaA%AH$?;Xnvcl?f8CP@5UmuM3DOaD5-@R)xkSnd6Lk=)njEA(n0Uwh%5w!B%Z$> zOO`=mI!a=H;KvgED?m}2AqnMgBoe*5g7&Hmr$KTdx+b#e0UL;`}o z_Rleyi6tfa>E#XU&@yT>Y50_6cioa4ELV}uwk?~mQ@|2mJkEY#aam=*VUd=0k6PYZ zZ)+*Ih?s5DxrnU^T?#KD_LfFf;l<*E;60`9a0%>kJifNMQ{JwyRWMtAdANW(u-0+4 zLVieYZEtP(am(?W+gtCuEAqV(Eyq{ZE3g3o-z~ul<=Mbem{1c6w+#~p372Ag)=3W&(xKp@)=Ok#_=ayMQ50bin6e5R)!_HVYO^iz1=Y^@^Bv( z@_B0+K~L~bA78~AteCZk2CD;Mp=iC`X;EV-4q9EY30rGR+I8YA6M*%i@Ioq6wezAH zfdgmpVArAsyr@b#&h5wpP49MFLe_mM7w*qi5j#rfucn1wribMTr&#F=9JMqp&{9TG z6fLb3iY3i3>iKLbujqM2E$F=xEK8H_^fb;f#yA$lD$ZYp3zOV|g$Yo(g*wesxqq~S z7!7Bkwbp4+Ybm7o;1Y|h+`LIvx7K$zHis^fU5+by0hC{>X_;(Z%c`1EtmU;@p|0qq zVhR3c$T5wwX8PG1T=poFJuk}cwOH@f+P+fUS%xz}hm-5WdEvurizoo=d`#98?c9J?)Z2}}(o+QC5+ z{=SOyPMWhkj@3SvJFk}}rBLZ12-Z0BHLv`>Tv(g#!HT-WQCjlXFN)e4Tu3>zb)Wd! zETvi@n=h6!xmsP#8QFX~tEKDIP&4_wqU3wEY0&fSqqn)9rBr>g3oxcv`e(8Bdpst=zE$n9UHIu?d8%+$tc_e$ z_DybT;N01(Jj1px07zi7R-L^_h(YTb7)x)JJ!^1&5nSJ5Ya14bzrVh+TIOzsTkY8% zE-CWC%K9#BC9^Q^W&}WvLXnrF}99b-LtbN;H^+63*NFA@!? z@|C7Tw*g?yDxT`MWUh=sw>^Bjs;6Kp2M1MX;kYMzu%>M}u9lRH4jyAJok_z`qLgDg z%w~(}Oo0|lih6t<*5J>SKi%P)WQtUTIyF+40pcfiu%p(|2U_zNXmRz4Ms^56AxAWx4i?zpsGTWdb>R=$1}xDPBjW@PBRo;$)^k1x~6B9_ysy>lTZOaG$x-bVlWBG(oK!LqYb9qJVOi-v+_3i(+< za=x(LUEvg13U4*JP~p9{&hia!EyS8NIGA8U>-AcO=3umB6l>sR=88Ebtrb;T%pKiL z#N?UP$H{>b@HP-gbW)jJVcl)XMhpD=Y)Z4QI$%bB8Lh78QDk`*OpOVX{GhDt1x9f>p7HVhaw#SY6+$ zZNgB|CPh*^ghA}XhCxZKu_sg0Xuu$m&1vkxP!a~Ku;RJKhJPlE5Mf-J&q6vFYw`qq zX2Z}9V9z){KH9hK1M96DH(>7*{T>^d!t%_AFn!g!q1}La7??OH6_?_>@84v5UkWns zlq)P#PuRDeD&cw`ydwc`WT9cg;@E|GMa^V$d6eg7yK79nc&#_G! zQj0`Z$Rbf-%gP`9h>*0qAIP74)l5_&lk90T$q1PgPn$_L$V5?3n@LW{B!AjW@8s5k?|4N7>bMzCD&ki?zV5p0+yLYxv#i8*kzf%s$MC!enUhoFKHIK0JHl_rJcA zS7lQ1T~Nnu2;iq0yIg(Qo9E$+O`wKzhg88p1fw-Z<*!B8iLQe)U zEGz_c+WCAMI_?~$s&1rVbfD9EhSrOD@Rn%t=LEQ1xS;p_vl*_N*KbE~a3H)G2X9wryp&<{Omk*TS zkrQ3T!D;!pqH4K9x?WIJFj`tsr-fWvQ?-Jg$s3xQu7A1^k6oJC`TT8HLw)}LFa?bm zyPROS5{%J@ZGsh)%^SaZ+YRu+j_U-o-NO6E%)i&BUEY8@5N=@0)tFC&qOq?Puv2#zYs$IH+-6BV6-_*0pcXm zlT4nttIvRWg4vo{Rrb4@g;oW2MHhn zB!C2v01`j~NB{{S0VIF~kN^^R9tgzalaW{5f=o)2k+Zn}-}9hWECUii0!RP}AOR$R z1dsp{Kmter2_S*-1ibzK#;3wqB!C2v01`j~NB{{S0VIF~kN^@u0!ZLlCV=DrXIXnF zI}$(wNB{{S0VIF~kN^@u0!RP}Ac2z+VDJA+@W=n~@Bf{Q7p6i2NB{{S0VIF~kN^@u T0!RP}AOR%stPl|Y{@?!x3WhyK literal 0 HcmV?d00001