From 58bf75db6829844ac4d077b060dbedcf1210ec1b Mon Sep 17 00:00:00 2001 From: Yaojia Wang Date: Tue, 27 Jan 2026 00:47:10 +0100 Subject: [PATCH] WIP --- .claude/settings.json | 5 +- .claude/settings.local.json | 8 +- .coverage | Bin 0 -> 77824 bytes README.md | 34 +- create_shims.sh | 96 + docs/CODE_REVIEW_REPORT.md | 405 -- docs/FIELD_EXTRACTOR_ANALYSIS.md | 96 - docs/MACHINE_CODE_PARSER_ANALYSIS.md | 238 -- docs/PERFORMANCE_OPTIMIZATION.md | 519 --- docs/REFACTORING_PLAN.md | 1447 ------- docs/REFACTORING_SUMMARY.md | 170 - docs/TEST_COVERAGE_IMPROVEMENT.md | 258 -- docs/multi_pool_design.md | 619 --- docs/product-plan-v2.md | 1223 ++++++ docs/ux-design-prompt-v2.md | 302 ++ docs/web-refactoring-complete.md | 273 ++ docs/web-refactoring-plan.md | 186 + docs/web-refactoring-status.md | 218 + frontend/.env.example | 5 + frontend/.gitignore | 24 + frontend/README.md | 20 + frontend/REFACTORING_PLAN.md | 240 ++ frontend/SETUP.md | 256 ++ frontend/index.html | 15 + frontend/metadata.json | 5 + frontend/package-lock.json | 3510 +++++++++++++++++ frontend/package.json | 32 + frontend/postcss.config.js | 6 + frontend/src/App.tsx | 73 + frontend/src/api/client.ts | 41 + frontend/src/api/endpoints/annotations.ts | 66 + frontend/src/api/endpoints/documents.ts | 80 + frontend/src/api/endpoints/index.ts | 4 + frontend/src/api/endpoints/inference.ts | 16 + frontend/src/api/endpoints/training.ts | 74 + frontend/src/api/types.ts | 173 + frontend/src/components/Badge.tsx | 39 + frontend/src/components/Button.tsx | 38 + frontend/src/components/Dashboard.tsx | 266 ++ frontend/src/components/DashboardOverview.tsx | 148 + frontend/src/components/DocumentDetail.tsx | 504 +++ frontend/src/components/InferenceDemo.tsx | 466 +++ frontend/src/components/Layout.tsx | 102 + frontend/src/components/Login.tsx | 188 + frontend/src/components/Models.tsx | 134 + frontend/src/components/Training.tsx | 113 + frontend/src/components/UploadModal.tsx | 210 + frontend/src/hooks/index.ts | 4 + 
frontend/src/hooks/useAnnotations.ts | 70 + frontend/src/hooks/useDocumentDetail.ts | 25 + frontend/src/hooks/useDocuments.ts | 78 + frontend/src/hooks/useTraining.ts | 83 + frontend/src/main.tsx | 23 + frontend/src/styles/index.css | 26 + frontend/src/types/index.ts | 48 + frontend/tailwind.config.js | 47 + frontend/tsconfig.json | 29 + frontend/vite.config.ts | 16 + requirements.txt | 4 + src/cli/analyze_labels.py | 2 +- src/cli/analyze_report.py | 2 +- src/cli/autolabel.py | 2 +- src/cli/import_report_to_db.py | 2 +- src/cli/infer.py | 7 +- src/cli/reprocess_failed.py | 3 +- src/cli/serve.py | 6 +- src/cli/train.py | 6 +- config.py => src/config.py | 8 +- src/data/admin_db.py | 1156 ++++++ src/data/admin_models.py | 339 ++ src/data/async_request_db.py | 374 ++ src/data/database.py | 103 + src/data/db.py | 2 +- src/data/migrations/001_async_tables.sql | 83 + .../migrations/002_nullable_admin_token.sql | 5 + src/data/models.py | 95 + src/processing/autolabel_tasks.py | 6 +- src/validation/llm_validator.py | 4 +- src/web/admin_routes_new.py | 8 + src/web/api/__init__.py | 0 src/web/api/v1/__init__.py | 0 src/web/api/v1/admin/__init__.py | 19 + src/web/api/v1/admin/annotations.py | 644 +++ src/web/api/v1/admin/auth.py | 82 + src/web/api/v1/admin/documents.py | 551 +++ src/web/api/v1/admin/locks.py | 184 + src/web/api/v1/admin/training.py | 622 +++ src/web/api/v1/batch/__init__.py | 0 src/web/api/v1/batch/routes.py | 236 ++ src/web/api/v1/public/__init__.py | 16 + src/web/api/v1/public/async_api.py | 372 ++ .../{routes.py => api/v1/public/inference.py} | 13 +- src/web/api/v1/public/labeling.py | 203 + src/web/app.py | 158 +- src/web/config.py | 48 +- src/web/core/__init__.py | 28 + src/web/core/auth.py | 60 + src/web/core/autolabel_scheduler.py | 153 + src/web/core/rate_limiter.py | 211 + src/web/core/scheduler.py | 329 ++ src/web/dependencies.py | 133 + src/web/rate_limiter.py | 211 + src/web/schemas.py | 86 - src/web/schemas/__init__.py | 11 + 
src/web/schemas/admin.py | 539 +++ src/web/schemas/common.py | 15 + src/web/schemas/inference.py | 196 + src/web/schemas/labeling.py | 13 + src/web/services/__init__.py | 18 + src/web/services/async_processing.py | 383 ++ src/web/services/autolabel.py | 335 ++ src/web/services/batch_upload.py | 548 +++ src/web/services/db_autolabel.py | 531 +++ .../{services.py => services/inference.py} | 6 +- src/web/workers/__init__.py | 24 + src/web/workers/async_queue.py | 181 + src/web/workers/batch_queue.py | 225 ++ src/yolo/db_dataset.py | 3 +- tests/data/test_admin_models_v2.py | 524 +++ tests/test_config.py | 12 +- tests/web/__init__.py | 1 + tests/web/conftest.py | 132 + tests/web/test_admin_annotations.py | 197 + tests/web/test_admin_auth.py | 162 + tests/web/test_admin_routes.py | 164 + tests/web/test_admin_routes_enhanced.py | 351 ++ tests/web/test_admin_training.py | 247 ++ tests/web/test_annotation_locks.py | 276 ++ tests/web/test_annotation_phase5.py | 420 ++ tests/web/test_async_queue.py | 217 + tests/web/test_async_routes.py | 409 ++ tests/web/test_async_service.py | 266 ++ tests/web/test_autolabel_with_locks.py | 250 ++ tests/web/test_batch_queue.py | 282 ++ tests/web/test_batch_upload_routes.py | 368 ++ tests/web/test_batch_upload_service.py | 221 ++ tests/web/test_inference_api.py | 298 ++ tests/web/test_inference_service.py | 297 ++ tests/web/test_rate_limiter.py | 154 + tests/web/test_training_phase4.py | 384 ++ update_test_imports.py | 68 + 141 files changed, 24814 insertions(+), 3884 deletions(-) create mode 100644 .coverage create mode 100644 create_shims.sh delete mode 100644 docs/CODE_REVIEW_REPORT.md delete mode 100644 docs/FIELD_EXTRACTOR_ANALYSIS.md delete mode 100644 docs/MACHINE_CODE_PARSER_ANALYSIS.md delete mode 100644 docs/PERFORMANCE_OPTIMIZATION.md delete mode 100644 docs/REFACTORING_PLAN.md delete mode 100644 docs/REFACTORING_SUMMARY.md delete mode 100644 docs/TEST_COVERAGE_IMPROVEMENT.md delete mode 100644 docs/multi_pool_design.md create 
mode 100644 docs/product-plan-v2.md create mode 100644 docs/ux-design-prompt-v2.md create mode 100644 docs/web-refactoring-complete.md create mode 100644 docs/web-refactoring-plan.md create mode 100644 docs/web-refactoring-status.md create mode 100644 frontend/.env.example create mode 100644 frontend/.gitignore create mode 100644 frontend/README.md create mode 100644 frontend/REFACTORING_PLAN.md create mode 100644 frontend/SETUP.md create mode 100644 frontend/index.html create mode 100644 frontend/metadata.json create mode 100644 frontend/package-lock.json create mode 100644 frontend/package.json create mode 100644 frontend/postcss.config.js create mode 100644 frontend/src/App.tsx create mode 100644 frontend/src/api/client.ts create mode 100644 frontend/src/api/endpoints/annotations.ts create mode 100644 frontend/src/api/endpoints/documents.ts create mode 100644 frontend/src/api/endpoints/index.ts create mode 100644 frontend/src/api/endpoints/inference.ts create mode 100644 frontend/src/api/endpoints/training.ts create mode 100644 frontend/src/api/types.ts create mode 100644 frontend/src/components/Badge.tsx create mode 100644 frontend/src/components/Button.tsx create mode 100644 frontend/src/components/Dashboard.tsx create mode 100644 frontend/src/components/DashboardOverview.tsx create mode 100644 frontend/src/components/DocumentDetail.tsx create mode 100644 frontend/src/components/InferenceDemo.tsx create mode 100644 frontend/src/components/Layout.tsx create mode 100644 frontend/src/components/Login.tsx create mode 100644 frontend/src/components/Models.tsx create mode 100644 frontend/src/components/Training.tsx create mode 100644 frontend/src/components/UploadModal.tsx create mode 100644 frontend/src/hooks/index.ts create mode 100644 frontend/src/hooks/useAnnotations.ts create mode 100644 frontend/src/hooks/useDocumentDetail.ts create mode 100644 frontend/src/hooks/useDocuments.ts create mode 100644 frontend/src/hooks/useTraining.ts create mode 100644 
frontend/src/main.tsx create mode 100644 frontend/src/styles/index.css create mode 100644 frontend/src/types/index.ts create mode 100644 frontend/tailwind.config.js create mode 100644 frontend/tsconfig.json create mode 100644 frontend/vite.config.ts rename config.py => src/config.py (91%) create mode 100644 src/data/admin_db.py create mode 100644 src/data/admin_models.py create mode 100644 src/data/async_request_db.py create mode 100644 src/data/database.py create mode 100644 src/data/migrations/001_async_tables.sql create mode 100644 src/data/migrations/002_nullable_admin_token.sql create mode 100644 src/data/models.py create mode 100644 src/web/admin_routes_new.py create mode 100644 src/web/api/__init__.py create mode 100644 src/web/api/v1/__init__.py create mode 100644 src/web/api/v1/admin/__init__.py create mode 100644 src/web/api/v1/admin/annotations.py create mode 100644 src/web/api/v1/admin/auth.py create mode 100644 src/web/api/v1/admin/documents.py create mode 100644 src/web/api/v1/admin/locks.py create mode 100644 src/web/api/v1/admin/training.py create mode 100644 src/web/api/v1/batch/__init__.py create mode 100644 src/web/api/v1/batch/routes.py create mode 100644 src/web/api/v1/public/__init__.py create mode 100644 src/web/api/v1/public/async_api.py rename src/web/{routes.py => api/v1/public/inference.py} (96%) create mode 100644 src/web/api/v1/public/labeling.py create mode 100644 src/web/core/__init__.py create mode 100644 src/web/core/auth.py create mode 100644 src/web/core/autolabel_scheduler.py create mode 100644 src/web/core/rate_limiter.py create mode 100644 src/web/core/scheduler.py create mode 100644 src/web/dependencies.py create mode 100644 src/web/rate_limiter.py delete mode 100644 src/web/schemas.py create mode 100644 src/web/schemas/__init__.py create mode 100644 src/web/schemas/admin.py create mode 100644 src/web/schemas/common.py create mode 100644 src/web/schemas/inference.py create mode 100644 src/web/schemas/labeling.py create mode 
100644 src/web/services/__init__.py create mode 100644 src/web/services/async_processing.py create mode 100644 src/web/services/autolabel.py create mode 100644 src/web/services/batch_upload.py create mode 100644 src/web/services/db_autolabel.py rename src/web/{services.py => services/inference.py} (97%) create mode 100644 src/web/workers/__init__.py create mode 100644 src/web/workers/async_queue.py create mode 100644 src/web/workers/batch_queue.py create mode 100644 tests/data/test_admin_models_v2.py create mode 100644 tests/web/__init__.py create mode 100644 tests/web/conftest.py create mode 100644 tests/web/test_admin_annotations.py create mode 100644 tests/web/test_admin_auth.py create mode 100644 tests/web/test_admin_routes.py create mode 100644 tests/web/test_admin_routes_enhanced.py create mode 100644 tests/web/test_admin_training.py create mode 100644 tests/web/test_annotation_locks.py create mode 100644 tests/web/test_annotation_phase5.py create mode 100644 tests/web/test_async_queue.py create mode 100644 tests/web/test_async_routes.py create mode 100644 tests/web/test_async_service.py create mode 100644 tests/web/test_autolabel_with_locks.py create mode 100644 tests/web/test_batch_queue.py create mode 100644 tests/web/test_batch_upload_routes.py create mode 100644 tests/web/test_batch_upload_service.py create mode 100644 tests/web/test_inference_api.py create mode 100644 tests/web/test_inference_service.py create mode 100644 tests/web/test_rate_limiter.py create mode 100644 tests/web/test_training_phase4.py create mode 100644 update_test_imports.py diff --git a/.claude/settings.json b/.claude/settings.json index 5bcb51f..cdbb849 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -7,7 +7,8 @@ "Edit(*)", "Glob(*)", "Grep(*)", - "Task(*)" + "Task(*)", + "Bash(wsl bash -c \"source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && pytest 
tests/web/test_batch_upload_routes.py::TestBatchUploadRoutes::test_upload_batch_async_mode_default -v -s 2>&1 | head -100\")" ] } -} \ No newline at end of file +} diff --git a/.claude/settings.local.json b/.claude/settings.local.json index b4a6c77..da6d4f0 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -81,7 +81,13 @@ "Bash(wsl bash -c \"cat /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/runs/train/invoice_fields/results.csv\")", "Bash(wsl bash -c \"ls -la /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/runs/train/invoice_fields/weights/\")", "Bash(wsl bash -c \"cat ''/mnt/c/Users/yaoji/AppData/Local/Temp/claude/c--Users-yaoji-git-ColaCoder-invoice-master-poc-v2/tasks/b8d8565.output'' 2>/dev/null | tail -100\")", - "Bash(wsl bash -c:*)" + "Bash(wsl bash -c:*)", + "Bash(wsl bash -c \"cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && python -m pytest tests/web/test_admin_*.py -v --tb=short 2>&1 | head -120\")", + "Bash(wsl bash -c \"cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && python -m pytest tests/web/test_admin_*.py -v --tb=short 2>&1 | head -80\")", + "Bash(wsl bash -c \"cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && python -m pytest tests/ -v --tb=short 2>&1 | tail -60\")", + "Bash(wsl bash -c \"source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && python -m pytest tests/data/test_admin_models_v2.py -v 2>&1 | head -100\")", + "Bash(dir src\\\\web\\\\*admin* src\\\\web\\\\*batch*)", + "Bash(wsl bash -c \"source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && python3 -c \"\"\n# Test 
FastAPI Form parsing behavior\nfrom fastapi import Form\nfrom typing import Annotated\n\n# Simulate what happens when data={''upload_source'': ''ui''} is sent\n# and async_mode is not in the data\nprint\\(''Test 1: async_mode not provided, default should be True''\\)\nprint\\(''Expected: True''\\)\n\n# In FastAPI, when Form has a default, it will use that default if not provided\n# But we need to verify this is actually happening\n\"\"\")" ], "deny": [], "ask": [], diff --git a/.coverage b/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..482cda9d232c3133ab100ddeebe0741ac897fa38 GIT binary patch literal 77824 zcmeHQ3y>Ved7hb_eLv=ekPabBw}8b-NO!A42mwOqBq89C(CKlf({YmKW@qQ_M%tZO z&CH&3B91gN6n<1WuGm453qMj$fnE`1spEwXlU)CMuE4e& z4G>jJuPD}B{X3kZM{bqJ7a)40T4c*X5N)3{?~KJmvVYwaiMv zG4o~J@SYmh;pET}&=RWnqQRcsOm)tuEV zWm@YG?)t^TZ`zQ+bOw^K?wvggnNGt5IH|k|n(B=Lk|R zk-un5B*>L!7HF#oE?xbq;>_gd>rO6%;=R#W_vR%FDNa<72gq;i!7jm5`CSo@{R$NN z;GPG^r}O1z8r#V7~^)lLt!*JggMWn&a6oIrYqw)ICjoeSqe9 zoqKp{jrTZY@4cpN>rkzdYRkT@el_1aqu8VrvU~GI(=2O>;nxiiV#o%Wx}EoQm|rVx zZDQDS1Y>%(*VCz9xsYOh1TtMcf5Abeq8O#1z9QAIj|yc!$(%%vUx+}!d@p258?tB{ zW*s!ROG>Odet#7>A*dCl89)Y*0b~FfKn9QjWB?gJ1};AaBD}!IN&8=rzQsx3hacRK0b~FfKn9Qj zWB?gJ29N<{02x3AkO5@iEy+Meh^&z6(?ZsF@sZ`rf;j-2`Uf`j52PSgf^?CSE=oUr zOGXftMh1`pWB?gJ29N<{02x3AkO5=>89)Y*fd&Ja$O^vR4Irc<%acJbKop)3q@R)J zu8>~!4;q2+02x3AkO5=>89)Y*0b~FfKn9QjWB?gJ2Hx5XEF#P9b@dDID@Z#@Toy}z z<{yv~+>ile02x3AkO5=>89)Y*0b~FfKn9QjWMDo6(uC;V$`>kzQ&0<|HY|5om{ZJS zdSP033OmfQvcoKCRzWvrOf zSO{?##N`f(Zh-|KmiLE62Q64)__$`s-h#CNjqCr-PfEOk3?Ku@05X6KAOpw%GJp&q z1IPd}fDBwt3`F<{mnHB2^U`Y^{KpL$Kn9QjWB?gJ29N<{02x3AkO5=>89)XuLk1#| zY?7@1FJ-Uiq+dw?DScgfOzO&gH}_cXq1@@*mh69J|0er;*+Xy}H)H@AKn9QjWB?gJ z29N<{02x3Aen%KsFNu7zm=wxxu2m`NdZ{L)GVAYgx{+G+RUwm|+4oDuD^a6O#d7^v}j{6x8_j-oY zQ9kLeOh7bV-&(%~kPsi(K&QrU^T@vk;>kv$=so8@8{3=$O-^NmO`u7p>+B%+_Ya-r zt~%E>+nxI|dF1RC_<43E{5&v8qU|d-Ng%q!WD){j856kQ=kp0T+%BGb`I|4i@WM~v zCihy;_jt~MfE;%l{PBZvqB<8j_k-8YynY6dxO`@CE62S^4q@x!xCH!b-H1jImRx+1 
zUljuxyR!m!>Y2*dp1v^f{GH@F;E=$BobtT>@Kpfy4wnf19*;NpW^dLFqV-X0I6Z)?Z zV1Ip1SjxHD_)>0i{1I(D%0mj|cryMkNf$U+`|rP{f0O=2`lj@p^fBov>4Nl^&>Qd> z>C4hjr0)ile02x3AkO5=>89)Y*0b~FfKn9QjWFT|~NOuLFl)PO$=k3y2Z89)Y*0c3zM zK;Hky_CFy3H)H@AKn9QjWB?gJ29N<{02x3AkO5@i@?{{Eem}Q{|12*(6+e=?BY81- zK6!iM2Z{5Ed*lBXe=_w!aWTy_gOTu&$N^P!heDP!RXcrpj*_wdxaf|DEu*ZR(u-wHhVN=>l&r#=5FhQ7 zqB@j}{$rv$7IJ)4(@-HM2K<#MQGNG|?xv7M6%|{9lwZgC)luD;(k#tTwE`$_JBs1h z0Q>g6qI(^S#*m;&L58oG>W(Y};B|XMw}%<90-tCt6aB8eS9H6X;Yvk-x+avUlK2Km zlHV`7z0A6)_N-hs75F4A=-6{ebT=?VDm4c_b6eEPvZYl`%K_k(;Q-rnh6 zR2{kjU{{3!2H#DdC7N}QiS7WaW?+VvscN<@Pbqp?D*^7(FqHJlGyq$|TJDnM)hi^g z1X)IQOy9n*WHm(s-N=xj>*7(-?PDcWF_iM$X|0*RuLu*>D?3Q(yJ$#smoVvS8dLf- zAa$`;^i=g#vmLXdS+Y^96k%6YEHGA(A>AjsD_FGEPN`azeD|Bgoh&c5FI;!=sOWBG z(XamDaLP%fYTg_7iSDW};2Xv8>cgVDISj-pT`M>IB-t#J0-%?*Myj*X0uI2}vsTM> zVDOV1?G&j!R~-=DHDM4zCD&TkGOL**=FBpbR+^)!j`x*%k+m>rLRa;wR@Mzt6GSFO zcLxgz6sj^;(F{i>$j$tlWi_{^0N*Ac@hoTyc$`GTDN$wI|&3Tj~#iXpo& zrC~em;7?_#^SOn2nqlZV|pJyi58de~~|*crf##v_JQ1?n8-uWH9-a z|wwnW{yb;da()gy2<`QVCl7a0ms*HrA4kAXXH0hLjd6WT9t+14OhG|ulvVK}CG>)K@uxr(7SqJY$QB|`BZE?AI7o^@% z7Tt;PrB=%|ds?^5R$*ICitdTvn9II z;m0p1)H_D)wi(eq5WYyS_pA|XlDBTPMOP1BklN;Bx5&KZi0Ez!UuMv&vgyq+{i0$V zpKtm%v-S-(h-ooFr9$#_s}*ic|8ZlP;$N%!WNkrFClhqIZ5}j z)+MBFf=)*8dAt_cPF5ek=ICX+Fhz<`N1jzIJ?OSfuvSt&Fy+a`#zl7{i&0bbo;0oF zq+#mUAosCm7Yf-??BjAtx2uYy&Uh5>Rz!Cvi{70rm||utYY$K-2~c!xVD-qGxRkax zvHENE)}Ln~pymzPUas$wGi*2pPpUB0v@B@Q1-%hzR_`XrNo`|2DOT6ahYecFGVSMP zST&~1o=V?6kQCM>Cr2m`vPLB6=Y^D{{p`K0eF{GH0k~*ZrOl3*JT#u1tzx!u8!@)RmNB-$myC_j4cS zq)$mlr9|$@+}`Z#*-vDLGp}a;G_x!HO8O7d+frXoeIT_Z`Ev4+e(9uyZ zR;j+r_JLZx4`hsmD`SCy+VOQDYE!tP0{?aYT7X^?0yJeC!#4otbs=DS`5TOPbPWI& zLI8BkiGwcavFibRL)hTJ!@gAj+#3RL zvkQ9k?EuM=V$IG`0Bwx_4_^ncESA+i{(s-K0NBG2bq}zS{zKOQTsJG6HwHYM4J9;{KK_4jDS!_r)^Tz*27KMn~82`Vg3s9G^=vp8D zFG_&K;y+T=S6km?WBh-24rHug(6(j#U(N#B4hFQQ#?_9)o#Po0z+$uux4p;G0L@a3 z_|WsaAx2XG%;JUk!0l5`_a_0C#b@?mTiU{7iBLG0-}g8Y2Vj;A>?_;W$leN18xg2E#0 z{~Nd`Iq8Ga`rJR|^xUHCr?Y!A--Z$VMd?qZH>N(H8ilt4tYlZ>sf3zX9Dgou#jlNh 
zG3LZpL_ZTf1fLQ7xHuH~QRI=xR^esgQDG1NzxG9b`@)yoKDX<{Z6IelbUAM{w`=-V5P2YUk>C_) ztuuzKTR@N=x}Z0l+jV#o$YiMy8dFYGNYvd*n z#nL6vWQ{7G4sHNc7Eif}+BhQ3jUzV#?p6k3I*jE~YEKOSJWJN^0N?3d=;C^i!{XO< zkkj5Vb*um~#>1s*YJqp5N&A0A^h=!d7t&Dfd%5@J2C`qwp2#lEJd>HpNa<(NrL>fK zCN-H#Bp*xOmv{w6|F^_nia#7LKp(*AST6*{4H-ZNkO5@iw}ye?8q^>xZA{WVPB&Wi zIRMBq>%apn70FT@u(Skuz44$8kSvad59#B{#hPCBS2LKf0GDM1!oyVz!*u+&NTxNI zkr;HI9X!FPH0ek5ySG{zh2vGgWogmUxJ5EgTZZKlYUB;daWh0{#hZ&!haNKknq|a~ zXidia;Wfn;Iu2C;n58qZ1-LOZ18hu`0hz@*T9AFFYgMw827n(r9;Vujt}Q^8j{!1E zS6_?TekwM5|3JV&9R#rCMo$2|9sqBf(gnxZ4B#@GNPtTxpK8jW!Jz0hpym zBFW_eL$;U!QYRDuWX;IISfH1UX>!LW0i3lsZUJu$H_9?Vv*zRg+Rw-2glY*NISK+; z8rp>Ec>)^xPaFx4cQ1p`x=$Pq8{C_o25;Ab#T=dhaF$*sniw(^Sg5~OK!!e#-Vf;8 z7#i*K(3>s@!5{oKFj9+N&U?aKZ9 zlAU1IH@rxMI=E6NSl=$9aa&!fYM9Wiu9V684j0k7lPhJgzGBz{OxM5_)|E3_klS3T za;R$4uGDeX7mHfd?%+xt3&XJMJ_MCj^-xTtgDW+|`U027NC4b$rKVZeXroccFTk}5%OYk!P}f!Q|OZG?5{H$Ni= zv)vA`E;{OSN@lyk)IPT5F{$KfSnDBKbhq(uo`ZWo>-)Hr(o`NE4cB z94q_u2D1tGvA*3)D4_H_$Z^6yU$Q@r4 core/auth.py +cat > src/web/admin_auth.py << 'EOF' +"""DEPRECATED: Import from src.web.core.auth instead""" +from src.web.core.auth import * # noqa: F401, F403 +EOF + +# admin_autolabel.py -> services/autolabel.py +cat > src/web/admin_autolabel.py << 'EOF' +"""DEPRECATED: Import from src.web.services.autolabel instead""" +from src.web.services.autolabel import * # noqa: F401, F403 +EOF + +# admin_scheduler.py -> core/scheduler.py +cat > src/web/admin_scheduler.py << 'EOF' +"""DEPRECATED: Import from src.web.core.scheduler instead""" +from src.web.core.scheduler import * # noqa: F401, F403 +EOF + +# admin_schemas.py -> schemas/admin.py +cat > src/web/admin_schemas.py << 'EOF' +"""DEPRECATED: Import from src.web.schemas.admin instead""" +from src.web.schemas.admin import * # noqa: F401, F403 +EOF + +# schemas.py -> schemas/inference.py + schemas/common.py +cat > src/web/schemas.py << 'EOF' +"""DEPRECATED: Import from src.web.schemas.inference or src.web.schemas.common instead""" +from 
src.web.schemas.inference import * # noqa: F401, F403 +from src.web.schemas.common import * # noqa: F401, F403 +EOF + +# services.py -> services/inference.py +cat > src/web/services.py << 'EOF' +"""DEPRECATED: Import from src.web.services.inference instead""" +from src.web.services.inference import * # noqa: F401, F403 +EOF + +# async_queue.py -> workers/async_queue.py +cat > src/web/async_queue.py << 'EOF' +"""DEPRECATED: Import from src.web.workers.async_queue instead""" +from src.web.workers.async_queue import * # noqa: F401, F403 +EOF + +# async_service.py -> services/async_processing.py +cat > src/web/async_service.py << 'EOF' +"""DEPRECATED: Import from src.web.services.async_processing instead""" +from src.web.services.async_processing import * # noqa: F401, F403 +EOF + +# batch_queue.py -> workers/batch_queue.py +cat > src/web/batch_queue.py << 'EOF' +"""DEPRECATED: Import from src.web.workers.batch_queue instead""" +from src.web.workers.batch_queue import * # noqa: F401, F403 +EOF + +# batch_upload_service.py -> services/batch_upload.py +cat > src/web/batch_upload_service.py << 'EOF' +"""DEPRECATED: Import from src.web.services.batch_upload instead""" +from src.web.services.batch_upload import * # noqa: F401, F403 +EOF + +# batch_upload_routes.py -> api/v1/batch/routes.py +cat > src/web/batch_upload_routes.py << 'EOF' +"""DEPRECATED: Import from src.web.api.v1.batch.routes instead""" +from src.web.api.v1.batch.routes import * # noqa: F401, F403 +EOF + +# admin_routes.py -> api/v1/admin/documents.py +cat > src/web/admin_routes.py << 'EOF' +"""DEPRECATED: Import from src.web.api.v1.admin.documents instead""" +from src.web.api.v1.admin.documents import * # noqa: F401, F403 +EOF + +# admin_annotation_routes.py -> api/v1/admin/annotations.py +cat > src/web/admin_annotation_routes.py << 'EOF' +"""DEPRECATED: Import from src.web.api.v1.admin.annotations instead""" +from src.web.api.v1.admin.annotations import * # noqa: F401, F403 +EOF + +# 
admin_training_routes.py -> api/v1/admin/training.py +cat > src/web/admin_training_routes.py << 'EOF' +"""DEPRECATED: Import from src.web.api.v1.admin.training instead""" +from src.web.api.v1.admin.training import * # noqa: F401, F403 +EOF + +# routes.py -> api/v1/public/inference.py +cat > src/web/routes.py << 'EOF' +"""DEPRECATED: Import from src.web.api.v1.public.inference instead""" +from src.web.api.v1.public.inference import * # noqa: F401, F403 +EOF + +echo "✓ Created backward compatibility shims for all migrated files" diff --git a/docs/CODE_REVIEW_REPORT.md b/docs/CODE_REVIEW_REPORT.md deleted file mode 100644 index a8bc692..0000000 --- a/docs/CODE_REVIEW_REPORT.md +++ /dev/null @@ -1,405 +0,0 @@ -# Invoice Master POC v2 - 代码审查报告 - -**审查日期**: 2026-01-22 -**代码库规模**: 67 个 Python 源文件,约 22,434 行代码 -**测试覆盖率**: ~40-50% - ---- - -## 执行摘要 - -### 总体评估:**良好(B+)** - -**优势**: -- ✅ 清晰的模块化架构,职责分离良好 -- ✅ 使用了合适的数据类和类型提示 -- ✅ 针对瑞典发票的全面规范化逻辑 -- ✅ 空间索引优化(O(1) token 查找) -- ✅ 完善的降级机制(YOLO 失败时的 OCR fallback) -- ✅ 设计良好的 Web API 和 UI - -**主要问题**: -- ❌ 支付行解析代码重复(3+ 处) -- ❌ 长函数(`_normalize_customer_number` 127 行) -- ❌ 配置安全问题(明文数据库密码) -- ❌ 异常处理不一致(到处都是通用 Exception) -- ❌ 缺少集成测试 -- ❌ 魔法数字散布各处(0.5, 0.95, 300 等) - ---- - -## 1. 
架构分析 - -### 1.1 模块结构 - -``` -src/ -├── inference/ # 推理管道核心 -│ ├── pipeline.py (517 行) ⚠️ -│ ├── field_extractor.py (1,347 行) 🔴 太长 -│ └── yolo_detector.py -├── web/ # FastAPI Web 服务 -│ ├── app.py (765 行) ⚠️ HTML 内联 -│ ├── routes.py (184 行) -│ └── services.py (286 行) -├── ocr/ # OCR 提取 -│ ├── paddle_ocr.py -│ └── machine_code_parser.py (919 行) 🔴 太长 -├── matcher/ # 字段匹配 -│ └── field_matcher.py (875 行) ⚠️ -├── utils/ # 共享工具 -│ ├── validators.py -│ ├── text_cleaner.py -│ ├── fuzzy_matcher.py -│ ├── ocr_corrections.py -│ └── format_variants.py (610 行) -├── processing/ # 批处理 -├── data/ # 数据管理 -└── cli/ # 命令行工具 -``` - -### 1.2 推理流程 - -``` -PDF/Image 输入 - ↓ -渲染为图片 (pdf/renderer.py) - ↓ -YOLO 检测 (yolo_detector.py) - 检测字段区域 - ↓ -字段提取 (field_extractor.py) - ├→ OCR 文本提取 (ocr/paddle_ocr.py) - ├→ 规范化 & 验证 - └→ 置信度计算 - ↓ -交叉验证 (pipeline.py) - ├→ 解析 payment_line 格式 - ├→ 从 payment_line 提取 OCR/Amount/Account - └→ 与检测字段验证,payment_line 值优先 - ↓ -降级 OCR(如果关键字段缺失) - ├→ 全页 OCR - └→ 正则提取 - ↓ -InferenceResult 输出 -``` - ---- - -## 2. 代码质量问题 - -### 2.1 长函数(>50 行)🔴 - -| 函数 | 文件 | 行数 | 复杂度 | 问题 | -|------|------|------|--------|------| -| `_normalize_customer_number()` | field_extractor.py | **127** | 极高 | 4 层模式匹配,7+ 正则,复杂评分 | -| `_cross_validate_payment_line()` | pipeline.py | **127** | 极高 | 核心验证逻辑,8+ 条件分支 | -| `_normalize_bankgiro()` | field_extractor.py | 62 | 高 | Luhn 验证 + 多种降级 | -| `_normalize_plusgiro()` | field_extractor.py | 63 | 高 | 类似 bankgiro | -| `_normalize_payment_line()` | field_extractor.py | 74 | 高 | 4 种正则模式 | -| `_normalize_amount()` | field_extractor.py | 78 | 高 | 多策略降级 | - -**示例问题** - `_normalize_customer_number()` (第 776-902 行): -```python -def _normalize_customer_number(self, text: str): - # 127 行函数,包含: - # - 4 个嵌套的 if/for 循环 - # - 7 种不同的正则模式 - # - 5 个评分机制 - # - 处理有标签和无标签格式 -``` - -**建议**: 拆分为: -- `_find_customer_code_patterns()` -- `_find_labeled_customer_code()` -- `_score_customer_candidates()` - -### 2.2 代码重复 🔴 - -**支付行解析(3+ 处重复实现)**: - -1. 
`_parse_machine_readable_payment_line()` (pipeline.py:217-252) -2. `MachineCodeParser.parse()` (machine_code_parser.py:919 行) -3. `_normalize_payment_line()` (field_extractor.py:632-705) - -所有三处都实现类似的正则模式: -``` -格式: # # <Öre> > ## -``` - -**Bankgiro/Plusgiro 验证(重复)**: -- `validators.py`: `is_valid_bankgiro()`, `format_bankgiro()` -- `field_extractor.py`: `_normalize_bankgiro()`, `_normalize_plusgiro()`, `_luhn_checksum()` -- `normalizer.py`: `normalize_bankgiro()`, `normalize_plusgiro()` -- `field_matcher.py`: 类似匹配逻辑 - -**建议**: 创建统一模块: -```python -# src/common/payment_line_parser.py -class PaymentLineParser: - def parse(text: str) -> PaymentLineResult - -# src/common/giro_validator.py -class GiroValidator: - def validate_and_format(value: str, giro_type: str) -> str -``` - -### 2.3 错误处理不一致 ⚠️ - -**通用异常捕获(31 处)**: -```python -except Exception as e: # 代码库中 31 处 - result.errors.append(str(e)) -``` - -**问题**: -- 没有捕获特定错误类型 -- 通用错误消息丢失上下文 -- 第 142-147 行 (routes.py): 捕获所有异常,返回 500 状态 - -**当前写法** (routes.py:142-147): -```python -try: - service_result = inference_service.process_pdf(...) -except Exception as e: # 太宽泛 - logger.error(f"Error processing document: {e}") - raise HTTPException(status_code=500, detail=str(e)) -``` - -**改进建议**: -```python -except FileNotFoundError: - raise HTTPException(status_code=400, detail="PDF 文件未找到") -except PyMuPDFError: - raise HTTPException(status_code=400, detail="无效的 PDF 格式") -except OCRError: - raise HTTPException(status_code=503, detail="OCR 服务不可用") -``` - -### 2.4 配置安全问题 🔴 - -**config.py 第 24-30 行** - 明文凭据: -```python -DATABASE = { - 'host': '192.168.68.31', # 硬编码 IP - 'user': 'docmaster', # 硬编码用户名 - 'password': 'nY6LYK5d', # 🔴 明文密码! 
- 'database': 'invoice_master' -} -``` - -**建议**: -```python -DATABASE = { - 'host': os.getenv('DB_HOST', 'localhost'), - 'user': os.getenv('DB_USER', 'docmaster'), - 'password': os.getenv('DB_PASSWORD'), # 从环境变量读取 - 'database': os.getenv('DB_NAME', 'invoice_master') -} -``` - -### 2.5 魔法数字 ⚠️ - -| 值 | 位置 | 用途 | 问题 | -|---|------|------|------| -| 0.5 | 多处 | 置信度阈值 | 不可按字段配置 | -| 0.95 | pipeline.py | payment_line 置信度 | 无说明 | -| 300 | 多处 | DPI | 硬编码 | -| 0.1 | field_extractor.py | BBox 填充 | 应为配置 | -| 72 | 多处 | PDF 基础 DPI | 公式中的魔法数字 | -| 50 | field_extractor.py | 客户编号评分加分 | 无说明 | - -**建议**: 提取到配置: -```python -INFERENCE_CONFIG = { - 'confidence_threshold': 0.5, - 'payment_line_confidence': 0.95, - 'dpi': 300, - 'bbox_padding': 0.1, -} -``` - -### 2.6 命名不一致 ⚠️ - -**字段名称不一致**: -- YOLO 类名: `invoice_number`, `ocr_number`, `supplier_org_number` -- 字段名: `InvoiceNumber`, `OCR`, `supplier_org_number` -- CSV 列名: 可能又不同 -- 数据库字段名: 另一种变体 - -映射维护在多处: -- `yolo_detector.py` (90-100 行): `CLASS_TO_FIELD` -- 多个其他位置 - ---- - -## 3. 测试分析 - -### 3.1 测试覆盖率 - -**测试文件**: 13 个 -- ✅ 覆盖良好: field_matcher, normalizer, payment_line_parser -- ⚠️ 中等覆盖: field_extractor, pipeline -- ❌ 覆盖不足: web 层, CLI, 批处理 - -**估算覆盖率**: 40-50% - -### 3.2 缺失的测试用例 🔴 - -**关键缺失**: -1. 交叉验证逻辑 - 最复杂部分,测试很少 -2. payment_line 解析变体 - 多种实现,边界情况不清楚 -3. OCR 错误纠正 - 不同策略的复杂逻辑 -4. Web API 端点 - 没有请求/响应测试 -5. 批处理 - 多 worker 协调未测试 -6. 降级 OCR 机制 - YOLO 检测失败时 - ---- - -## 4. 架构风险 - -### 🔴 关键风险 - -1. **配置安全** - config.py 中明文数据库凭据(24-30 行) -2. **错误恢复** - 宽泛的异常处理掩盖真实问题 -3. **可测试性** - 硬编码依赖阻止单元测试 - -### 🟡 高风险 - -1. **代码可维护性** - 支付行解析重复 -2. **可扩展性** - 没有长时间推理的异步处理 -3. **扩展性** - 添加新字段类型会很困难 - -### 🟢 中等风险 - -1. **性能** - 懒加载有帮助,但 ORM 查询未优化 -2. **文档** - 大部分足够但可以更好 - ---- - -## 5. 
优先级矩阵 - -| 优先级 | 行动 | 工作量 | 影响 | -|--------|------|--------|------| -| 🔴 关键 | 修复配置安全(环境变量) | 1 小时 | 高 | -| 🔴 关键 | 添加集成测试 | 2-3 天 | 高 | -| 🔴 关键 | 文档化错误处理策略 | 4 小时 | 中 | -| 🟡 高 | 统一 payment_line 解析 | 1-2 天 | 高 | -| 🟡 高 | 提取规范化到子模块 | 2-3 天 | 中 | -| 🟡 高 | 添加依赖注入 | 2-3 天 | 中 | -| 🟡 高 | 拆分长函数 | 2-3 天 | 低 | -| 🟢 中 | 提高测试覆盖率到 70%+ | 3-5 天 | 高 | -| 🟢 中 | 提取魔法数字 | 4 小时 | 低 | -| 🟢 中 | 标准化命名约定 | 1-2 天 | 中 | - ---- - -## 6. 具体文件建议 - -### 高优先级(代码质量) - -| 文件 | 问题 | 建议 | -|------|------|------| -| `field_extractor.py` | 1,347 行;6 个长规范化方法 | 拆分为 `normalizers/` 子模块 | -| `pipeline.py` | 127 行 `_cross_validate_payment_line()` | 提取到单独的 `CrossValidator` 类 | -| `field_matcher.py` | 875 行;复杂匹配逻辑 | 拆分为 `matching/` 子模块 | -| `config.py` | 硬编码凭据(第 29 行) | 使用环境变量 | -| `machine_code_parser.py` | 919 行;payment_line 解析 | 与 pipeline 解析合并 | - -### 中优先级(重构) - -| 文件 | 问题 | 建议 | -|------|------|------| -| `app.py` | 765 行;HTML 内联在 Python 中 | 提取到 `templates/` 目录 | -| `autolabel.py` | 753 行;批处理逻辑 | 提取 worker 函数到模块 | -| `format_variants.py` | 610 行;变体生成 | 考虑策略模式 | - ---- - -## 7. 建议行动 - -### 第 1 阶段:关键修复(1 周) - -1. **配置安全** (1 小时) - - 移除 config.py 中的明文密码 - - 添加环境变量支持 - - 更新 README 说明配置 - -2. **错误处理标准化** (1 天) - - 定义自定义异常类 - - 替换通用 Exception 捕获 - - 添加错误代码常量 - -3. **添加关键集成测试** (2 天) - - 端到端推理测试 - - payment_line 交叉验证测试 - - API 端点测试 - -### 第 2 阶段:重构(2-3 周) - -4. **统一 payment_line 解析** (2 天) - - 创建 `src/common/payment_line_parser.py` - - 合并 3 处重复实现 - - 迁移所有调用方 - -5. **拆分 field_extractor.py** (3 天) - - 创建 `src/inference/normalizers/` 子模块 - - 每个字段类型一个文件 - - 提取共享验证逻辑 - -6. **拆分长函数** (2 天) - - `_normalize_customer_number()` → 3 个函数 - - `_cross_validate_payment_line()` → CrossValidator 类 - -### 第 3 阶段:改进(1-2 周) - -7. **提高测试覆盖率** (5 天) - - 目标:70%+ 覆盖率 - - 专注于验证逻辑 - - 添加边界情况测试 - -8. **配置管理改进** (1 天) - - 提取所有魔法数字 - - 创建配置文件(YAML) - - 添加配置验证 - -9. 
**文档改进** (2 天) - - 添加架构图 - - 文档化所有私有方法 - - 创建贡献指南 - ---- - -## 附录 A:度量指标 - -### 代码复杂度 - -| 类别 | 计数 | 平均行数 | -|------|------|----------| -| 源文件 | 67 | 334 | -| 长文件 (>500 行) | 12 | 875 | -| 长函数 (>50 行) | 23 | 89 | -| 测试文件 | 13 | 298 | - -### 依赖关系 - -| 类型 | 计数 | -|------|------| -| 外部依赖 | ~25 | -| 内部模块 | 10 | -| 循环依赖 | 0 ✅ | - -### 代码风格 - -| 指标 | 覆盖率 | -|------|--------| -| 类型提示 | 80% | -| Docstrings (公开) | 80% | -| Docstrings (私有) | 40% | -| 测试覆盖率 | 45% | - ---- - -**生成日期**: 2026-01-22 -**审查者**: Claude Code -**版本**: v2.0 diff --git a/docs/FIELD_EXTRACTOR_ANALYSIS.md b/docs/FIELD_EXTRACTOR_ANALYSIS.md deleted file mode 100644 index 75d934d..0000000 --- a/docs/FIELD_EXTRACTOR_ANALYSIS.md +++ /dev/null @@ -1,96 +0,0 @@ -# Field Extractor 分析报告 - -## 概述 - -field_extractor.py (1183行) 最初被识别为可优化文件,尝试使用 `src/normalize` 模块进行重构,但经过分析和测试后发现 **不应该重构**。 - -## 重构尝试 - -### 初始计划 -将 field_extractor.py 中的重复 normalize 方法删除,统一使用 `src/normalize/normalize_field()` 接口。 - -### 实施步骤 -1. ✅ 备份原文件 (`field_extractor_old.py`) -2. ✅ 修改 `_normalize_and_validate` 使用统一 normalizer -3. ✅ 删除重复的 normalize 方法 (~400行) -4. ❌ 运行测试 - **28个失败** -5. ✅ 添加 wrapper 方法委托给 normalizer -6. ❌ 再次测试 - **12个失败** -7. ✅ 还原原文件 -8. ✅ 测试通过 - **全部45个测试通过** - -## 关键发现 - -### 两个模块的不同用途 - -| 模块 | 用途 | 输入 | 输出 | 示例 | -|------|------|------|------|------| -| **src/normalize/** | **变体生成** 用于匹配 | 已提取的字段值 | 多个匹配变体列表 | `"INV-12345"` → `["INV-12345", "12345"]` | -| **field_extractor** | **值提取** 从OCR文本 | 包含字段的原始OCR文本 | 提取的单个字段值 | `"Fakturanummer: A3861"` → `"A3861"` | - -### 为什么不能统一? - -1. **src/normalize/** 的设计目的: - - 接收已经提取的字段值 - - 生成多个标准化变体用于fuzzy matching - - 例如 BankgiroNormalizer: - ```python - normalize("782-1713") → ["7821713", "782-1713"] # 生成变体 - ``` - -2. **field_extractor** 的 normalize 方法: - - 接收包含字段的原始OCR文本(可能包含标签、其他文本等) - - **提取**特定模式的字段值 - - 例如 `_normalize_bankgiro`: - ```python - _normalize_bankgiro("Bankgiro: 782-1713") → ("782-1713", True, None) # 从文本提取 - ``` - -3. 
**关键区别**: - - Normalizer: 变体生成器 (for matching) - - Field Extractor: 模式提取器 (for parsing) - -### 测试失败示例 - -使用 normalizer 替代 field extractor 方法后的失败: - -```python -# InvoiceNumber 测试 -Input: "Fakturanummer: A3861" -期望: "A3861" -实际: "Fakturanummer: A3861" # 没有提取,只是清理 - -# Bankgiro 测试 -Input: "Bankgiro: 782-1713" -期望: "782-1713" -实际: "7821713" # 返回了不带破折号的变体,而不是提取格式化值 -``` - -## 结论 - -**field_extractor.py 不应该使用 src/normalize 模块重构**,因为: - -1. ✅ **职责不同**: 提取 vs 变体生成 -2. ✅ **输入不同**: 包含标签的原始OCR文本 vs 已提取的字段值 -3. ✅ **输出不同**: 单个提取值 vs 多个匹配变体 -4. ✅ **现有代码运行良好**: 所有45个测试通过 -5. ✅ **提取逻辑有价值**: 包含复杂的模式匹配规则(例如区分 Bankgiro/Plusgiro 格式) - -## 建议 - -1. **保留 field_extractor.py 原样**: 不进行重构 -2. **文档化两个模块的差异**: 确保团队理解各自用途 -3. **关注其他优化目标**: machine_code_parser.py (919行) - -## 学习点 - -重构前应该: -1. 理解模块的**真实用途**,而不只是看代码相似度 -2. 运行完整测试套件验证假设 -3. 评估是否真的存在重复,还是表面相似但用途不同 - ---- - -**状态**: ✅ 分析完成,决定不重构 -**测试**: ✅ 45/45 通过 -**文件**: 保持 1183行 原样 diff --git a/docs/MACHINE_CODE_PARSER_ANALYSIS.md b/docs/MACHINE_CODE_PARSER_ANALYSIS.md deleted file mode 100644 index d3df7ad..0000000 --- a/docs/MACHINE_CODE_PARSER_ANALYSIS.md +++ /dev/null @@ -1,238 +0,0 @@ -# Machine Code Parser 分析报告 - -## 文件概况 - -- **文件**: `src/ocr/machine_code_parser.py` -- **总行数**: 919 行 -- **代码行**: 607 行 (66%) -- **方法数**: 14 个 -- **正则表达式使用**: 47 次 - -## 代码结构 - -### 类结构 - -``` -MachineCodeResult (数据类) -├── to_dict() -└── get_region_bbox() - -MachineCodeParser (主解析器) -├── __init__() -├── parse() - 主入口 -├── _find_tokens_with_values() -├── _find_machine_code_line_tokens() -├── _parse_standard_payment_line_with_tokens() -├── _parse_standard_payment_line() - 142行 ⚠️ -├── _extract_ocr() - 50行 -├── _extract_bankgiro() - 58行 -├── _extract_plusgiro() - 30行 -├── _extract_amount() - 68行 -├── _calculate_confidence() -└── cross_validate() -``` - -## 发现的问题 - -### 1. 
⚠️ `_parse_standard_payment_line` 方法过长 (142行) - -**位置**: 442-582 行 - -**问题**: -- 包含嵌套函数 `normalize_account_spaces` 和 `format_account` -- 多个正则匹配分支 -- 逻辑复杂,难以测试和维护 - -**建议**: -可以拆分为独立方法: -- `_normalize_account_spaces(line)` -- `_format_account(account_digits, context)` -- `_match_primary_pattern(line)` -- `_match_fallback_patterns(line)` - -### 2. 🔁 4个 `_extract_*` 方法有重复模式 - -所有 extract 方法都遵循相同模式: - -```python -def _extract_XXX(self, tokens): - candidates = [] - - for token in tokens: - text = token.text.strip() - matches = self.XXX_PATTERN.findall(text) - for match in matches: - # 验证逻辑 - # 上下文检测 - candidates.append((normalized, context_score, token)) - - if not candidates: - return None - - candidates.sort(key=lambda x: (x[1], 1), reverse=True) - return candidates[0][0] -``` - -**重复的逻辑**: -- Token 迭代 -- 模式匹配 -- 候选收集 -- 上下文评分 -- 排序和选择最佳匹配 - -**建议**: -可以提取基础提取器类或通用方法来减少重复。 - -### 3. ✅ 上下文检测重复 - -上下文检测代码在多个地方重复: - -```python -# _extract_bankgiro 中 -context_text = ' '.join(t.text.lower() for t in tokens) -is_bankgiro_context = ( - 'bankgiro' in context_text or - 'bg:' in context_text or - 'bg ' in context_text -) - -# _extract_plusgiro 中 -context_text = ' '.join(t.text.lower() for t in tokens) -is_plusgiro_context = ( - 'plusgiro' in context_text or - 'postgiro' in context_text or - 'pg:' in context_text or - 'pg ' in context_text -) - -# _parse_standard_payment_line 中 -context = (context_line or raw_line).lower() -is_plusgiro_context = ( - ('plusgiro' in context or 'postgiro' in context or 'plusgirokonto' in context) - and 'bankgiro' not in context -) -``` - -**建议**: -提取为独立方法: -- `_detect_account_context(tokens) -> dict[str, bool]` - -## 重构建议 - -### 方案 A: 轻度重构(推荐)✅ - -**目标**: 提取重复的上下文检测逻辑,不改变主要结构 - -**步骤**: -1. 提取 `_detect_account_context(tokens)` 方法 -2. 提取 `_normalize_account_spaces(line)` 为独立方法 -3. 
提取 `_format_account(digits, context)` 为独立方法 - -**影响**: -- 减少 ~50-80 行重复代码 -- 提高可测试性 -- 低风险,易于验证 - -**预期结果**: 919 行 → ~850 行 (↓7%) - -### 方案 B: 中度重构 - -**目标**: 创建通用的字段提取框架 - -**步骤**: -1. 创建 `_generic_extract(pattern, normalizer, context_checker)` -2. 重构所有 `_extract_*` 方法使用通用框架 -3. 拆分 `_parse_standard_payment_line` 为多个小方法 - -**影响**: -- 减少 ~150-200 行代码 -- 显著提高可维护性 -- 中等风险,需要全面测试 - -**预期结果**: 919 行 → ~720 行 (↓22%) - -### 方案 C: 深度重构(不推荐) - -**目标**: 完全重新设计为策略模式 - -**风险**: -- 高风险,可能引入 bugs -- 需要大量测试 -- 可能破坏现有集成 - -## 推荐方案 - -### ✅ 采用方案 A(轻度重构) - -**理由**: -1. **代码已经工作良好**: 没有明显的 bug 或性能问题 -2. **低风险**: 只提取重复逻辑,不改变核心算法 -3. **性价比高**: 小改动带来明显的代码质量提升 -4. **易于验证**: 现有测试应该能覆盖 - -### 重构步骤 - -```python -# 1. 提取上下文检测 -def _detect_account_context(self, tokens: list[TextToken]) -> dict[str, bool]: - """检测上下文中的账户类型关键词""" - context_text = ' '.join(t.text.lower() for t in tokens) - - return { - 'bankgiro': any(kw in context_text for kw in ['bankgiro', 'bg:', 'bg ']), - 'plusgiro': any(kw in context_text for kw in ['plusgiro', 'postgiro', 'plusgirokonto', 'pg:', 'pg ']), - } - -# 2. 提取空格标准化 -def _normalize_account_spaces(self, line: str) -> str: - """移除账户号码中的空格""" - # (现有 line 460-481 的代码) - -# 3. 提取账户格式化 -def _format_account( - self, - account_digits: str, - is_plusgiro_context: bool -) -> tuple[str, str]: - """格式化账户并确定类型""" - # (现有 line 485-523 的代码) -``` - -## 对比:field_extractor vs machine_code_parser - -| 特征 | field_extractor | machine_code_parser | -|------|-----------------|---------------------| -| 用途 | 值提取 | 机器码解析 | -| 重复代码 | ~400行normalize方法 | ~80行上下文检测 | -| 重构价值 | ❌ 不同用途,不应统一 | ✅ 可提取共享逻辑 | -| 风险 | 高(会破坏功能) | 低(只是代码组织) | - -## 决策 - -### ✅ 建议重构 machine_code_parser.py - -**与 field_extractor 的不同**: -- field_extractor: 重复的方法有**不同的用途**(提取 vs 变体生成) -- machine_code_parser: 重复的代码有**相同的用途**(都是上下文检测) - -**预期收益**: -- 减少 ~70 行重复代码 -- 提高可测试性(可以单独测试上下文检测) -- 更清晰的代码组织 -- **低风险**,易于验证 - -## 下一步 - -1. ✅ 备份原文件 -2. ✅ 提取 `_detect_account_context` 方法 -3. ✅ 提取 `_normalize_account_spaces` 方法 -4. 
✅ 提取 `_format_account` 方法 -5. ✅ 更新所有调用点 -6. ✅ 运行测试验证 -7. ✅ 检查代码覆盖率 - ---- - -**状态**: 📋 分析完成,建议轻度重构 -**风险评估**: 🟢 低风险 -**预期收益**: 919行 → ~850行 (↓7%) diff --git a/docs/PERFORMANCE_OPTIMIZATION.md b/docs/PERFORMANCE_OPTIMIZATION.md deleted file mode 100644 index 1fc1626..0000000 --- a/docs/PERFORMANCE_OPTIMIZATION.md +++ /dev/null @@ -1,519 +0,0 @@ -# Performance Optimization Guide - -This document provides performance optimization recommendations for the Invoice Field Extraction system. - -## Table of Contents - -1. [Batch Processing Optimization](#batch-processing-optimization) -2. [Database Query Optimization](#database-query-optimization) -3. [Caching Strategies](#caching-strategies) -4. [Memory Management](#memory-management) -5. [Profiling and Monitoring](#profiling-and-monitoring) - ---- - -## Batch Processing Optimization - -### Current State - -The system processes invoices one at a time. For large batches, this can be inefficient. - -### Recommendations - -#### 1. Database Batch Operations - -**Current**: Individual inserts for each document -```python -# Inefficient -for doc in documents: - db.insert_document(doc) # Individual DB call -``` - -**Optimized**: Use `execute_values` for batch inserts -```python -# Efficient - already implemented in db.py line 519 -from psycopg2.extras import execute_values - -execute_values(cursor, """ - INSERT INTO documents (...) - VALUES %s -""", document_values) -``` - -**Impact**: 10-50x faster for batches of 100+ documents - -#### 2. 
PDF Processing Batching - -**Recommendation**: Process PDFs in parallel using multiprocessing - -```python -from multiprocessing import Pool - -def process_batch(pdf_paths, batch_size=10): - """Process PDFs in parallel batches.""" - with Pool(processes=batch_size) as pool: - results = pool.map(pipeline.process_pdf, pdf_paths) - return results -``` - -**Considerations**: -- GPU models should use a shared process pool (already exists: `src/processing/gpu_pool.py`) -- CPU-intensive tasks can use separate process pool (`src/processing/cpu_pool.py`) -- Current dual pool coordinator (`dual_pool_coordinator.py`) already supports this pattern - -**Status**: ✅ Already implemented in `src/processing/` modules - -#### 3. Image Caching for Multi-Page PDFs - -**Current**: Each page rendered independently -```python -# Current pattern in field_extractor.py -for page_num in range(total_pages): - image = render_pdf_page(pdf_path, page_num, dpi=300) -``` - -**Optimized**: Pre-render all pages if processing multiple fields per page -```python -# Batch render -images = { - page_num: render_pdf_page(pdf_path, page_num, dpi=300) - for page_num in page_numbers_needed -} - -# Reuse images -for detection in detections: - image = images[detection.page_no] - extract_field(detection, image) -``` - -**Impact**: Reduces redundant PDF rendering by 50-90% for multi-field invoices - ---- - -## Database Query Optimization - -### Current Performance - -- **Parameterized queries**: ✅ Implemented (Phase 1) -- **Connection pooling**: ❌ Not implemented -- **Query batching**: ✅ Partially implemented -- **Index optimization**: ⚠️ Needs verification - -### Recommendations - -#### 1. 
Connection Pooling - -**Current**: New connection for each operation -```python -def connect(self): - """Create new database connection.""" - return psycopg2.connect(**self.config) -``` - -**Optimized**: Use connection pooling -```python -from psycopg2 import pool - -class DocumentDatabase: - def __init__(self, config): - self.pool = pool.SimpleConnectionPool( - minconn=1, - maxconn=10, - **config - ) - - def connect(self): - return self.pool.getconn() - - def close(self, conn): - self.pool.putconn(conn) -``` - -**Impact**: -- Reduces connection overhead by 80-95% -- Especially important for high-frequency operations - -#### 2. Index Recommendations - -**Check current indexes**: -```sql --- Verify indexes exist on frequently queried columns -SELECT tablename, indexname, indexdef -FROM pg_indexes -WHERE schemaname = 'public'; -``` - -**Recommended indexes**: -```sql --- If not already present -CREATE INDEX IF NOT EXISTS idx_documents_success - ON documents(success); - -CREATE INDEX IF NOT EXISTS idx_documents_timestamp - ON documents(timestamp DESC); - -CREATE INDEX IF NOT EXISTS idx_field_results_document_id - ON field_results(document_id); - -CREATE INDEX IF NOT EXISTS idx_field_results_matched - ON field_results(matched); - -CREATE INDEX IF NOT EXISTS idx_field_results_field_name - ON field_results(field_name); -``` - -**Impact**: -- 10-100x faster queries for filtered/sorted results -- Critical for `get_failed_matches()` and `get_all_documents_summary()` - -#### 3. Query Batching - -**Status**: ✅ Already implemented for field results (line 519) - -**Verify batching is used**: -```python -# Good pattern in db.py -execute_values(cursor, "INSERT INTO field_results (...) 
VALUES %s", field_values) -``` - -**Additional opportunity**: Batch `SELECT` queries -```python -# Current -docs = [get_document(doc_id) for doc_id in doc_ids] # N queries - -# Optimized -docs = get_documents_batch(doc_ids) # 1 query with IN clause -``` - -**Status**: ✅ Already implemented (`get_documents_batch` exists in db.py) - ---- - -## Caching Strategies - -### 1. Model Loading Cache - -**Current**: Models loaded per-instance - -**Recommendation**: Singleton pattern for YOLO model -```python -class YOLODetectorSingleton: - _instance = None - _model = None - - @classmethod - def get_instance(cls, model_path): - if cls._instance is None: - cls._instance = YOLODetector(model_path) - return cls._instance -``` - -**Impact**: Reduces memory usage by 90% when processing multiple documents - -### 2. Parser Instance Caching - -**Current**: ✅ Already optimal -```python -# Good pattern in field_extractor.py -def __init__(self): - self.payment_line_parser = PaymentLineParser() # Reused - self.customer_number_parser = CustomerNumberParser() # Reused -``` - -**Status**: No changes needed - -### 3. OCR Result Caching - -**Recommendation**: Cache OCR results for identical regions -```python -from functools import lru_cache - -@lru_cache(maxsize=1000) -def ocr_region_cached(image_hash, bbox): - """Cache OCR results by image hash + bbox.""" - return paddle_ocr.ocr_region(image, bbox) -``` - -**Impact**: 50-80% speedup when re-processing similar documents - -**Note**: Requires implementing image hashing (e.g., `hashlib.md5(image.tobytes())`) - ---- - -## Memory Management - -### Current Issues - -**Potential memory leaks**: -1. Large images kept in memory after processing -2. OCR results accumulated without cleanup -3. Model outputs not explicitly cleared - -### Recommendations - -#### 1. 
Explicit Image Cleanup - -```python -import gc - -def process_pdf(pdf_path): - try: - image = render_pdf(pdf_path) - result = extract_fields(image) - return result - finally: - del image # Explicit cleanup - gc.collect() # Force garbage collection -``` - -#### 2. Generator Pattern for Large Batches - -**Current**: Load all documents into memory -```python -docs = [process_pdf(path) for path in pdf_paths] # All in memory -``` - -**Optimized**: Use generator for streaming processing -```python -def process_batch_streaming(pdf_paths): - """Process documents one at a time, yielding results.""" - for path in pdf_paths: - result = process_pdf(path) - yield result - # Result can be saved to DB immediately - # Previous result is garbage collected -``` - -**Impact**: Constant memory usage regardless of batch size - -#### 3. Context Managers for Resources - -```python -class InferencePipeline: - def __enter__(self): - self.detector.load_model() - return self - - def __exit__(self, *args): - self.detector.unload_model() - self.extractor.cleanup() - -# Usage -with InferencePipeline(...) as pipeline: - results = pipeline.process_pdf(path) -# Automatic cleanup -``` - ---- - -## Profiling and Monitoring - -### Recommended Profiling Tools - -#### 1. cProfile for CPU Profiling - -```python -import cProfile -import pstats - -profiler = cProfile.Profile() -profiler.enable() - -# Your code here -pipeline.process_pdf(pdf_path) - -profiler.disable() -stats = pstats.Stats(profiler) -stats.sort_stats('cumulative') -stats.print_stats(20) # Top 20 slowest functions -``` - -#### 2. memory_profiler for Memory Analysis - -```bash -pip install memory_profiler -python -m memory_profiler your_script.py -``` - -Or decorator-based: -```python -from memory_profiler import profile - -@profile -def process_large_batch(pdf_paths): - # Memory usage tracked line-by-line - results = [process_pdf(path) for path in pdf_paths] - return results -``` - -#### 3. 
py-spy for Production Profiling - -```bash -pip install py-spy - -# Profile running process -py-spy top --pid 12345 - -# Generate flamegraph -py-spy record -o profile.svg -- python your_script.py -``` - -**Advantage**: No code changes needed, minimal overhead - -### Key Metrics to Monitor - -1. **Processing Time per Document** - - Target: <10 seconds for single-page invoice - - Current: ~2-5 seconds (estimated) - -2. **Memory Usage** - - Target: <2GB for batch of 100 documents - - Monitor: Peak memory usage - -3. **Database Query Time** - - Target: <100ms per query (with indexes) - - Monitor: Slow query log - -4. **OCR Accuracy vs Speed Trade-off** - - Current: PaddleOCR with GPU (~200ms per region) - - Alternative: Tesseract (~500ms, slightly more accurate) - -### Logging Performance Metrics - -**Add to pipeline.py**: -```python -import time -import logging - -logger = logging.getLogger(__name__) - -def process_pdf(self, pdf_path): - start = time.time() - - # Processing... - result = self._process_internal(pdf_path) - - elapsed = time.time() - start - logger.info(f"Processed {pdf_path} in {elapsed:.2f}s") - - # Log to database for analysis - self.db.log_performance({ - 'document_id': result.document_id, - 'processing_time': elapsed, - 'field_count': len(result.fields) - }) - - return result -``` - ---- - -## Performance Optimization Priorities - -### High Priority (Implement First) - -1. ✅ **Database parameterized queries** - Already done (Phase 1) -2. ⚠️ **Database connection pooling** - Not implemented -3. ⚠️ **Index optimization** - Needs verification - -### Medium Priority - -4. ⚠️ **Batch PDF rendering** - Optimization possible -5. ✅ **Parser instance reuse** - Already done (Phase 2) -6. ⚠️ **Model caching** - Could improve - -### Low Priority (Nice to Have) - -7. ⚠️ **OCR result caching** - Complex implementation -8. ⚠️ **Generator patterns** - Refactoring needed -9. 
⚠️ **Advanced profiling** - For production optimization - ---- - -## Benchmarking Script - -```python -""" -Benchmark script for invoice processing performance. -""" - -import time -from pathlib import Path -from src.inference.pipeline import InferencePipeline - -def benchmark_single_document(pdf_path, iterations=10): - """Benchmark single document processing.""" - pipeline = InferencePipeline( - model_path="path/to/model.pt", - use_gpu=True - ) - - times = [] - for i in range(iterations): - start = time.time() - result = pipeline.process_pdf(pdf_path) - elapsed = time.time() - start - times.append(elapsed) - print(f"Iteration {i+1}: {elapsed:.2f}s") - - avg_time = sum(times) / len(times) - print(f"\nAverage: {avg_time:.2f}s") - print(f"Min: {min(times):.2f}s") - print(f"Max: {max(times):.2f}s") - -def benchmark_batch(pdf_paths, batch_size=10): - """Benchmark batch processing.""" - from multiprocessing import Pool - - pipeline = InferencePipeline( - model_path="path/to/model.pt", - use_gpu=True - ) - - start = time.time() - - with Pool(processes=batch_size) as pool: - results = pool.map(pipeline.process_pdf, pdf_paths) - - elapsed = time.time() - start - avg_per_doc = elapsed / len(pdf_paths) - - print(f"Total time: {elapsed:.2f}s") - print(f"Documents: {len(pdf_paths)}") - print(f"Average per document: {avg_per_doc:.2f}s") - print(f"Throughput: {len(pdf_paths)/elapsed:.2f} docs/sec") - -if __name__ == "__main__": - # Single document benchmark - benchmark_single_document("test.pdf") - - # Batch benchmark - pdf_paths = list(Path("data/test_pdfs").glob("*.pdf")) - benchmark_batch(pdf_paths[:100]) -``` - ---- - -## Summary - -**Implemented (Phase 1-2)**: -- ✅ Parameterized queries (SQL injection fix) -- ✅ Parser instance reuse (Phase 2 refactoring) -- ✅ Batch insert operations (execute_values) -- ✅ Dual pool processing (CPU/GPU separation) - -**Quick Wins (Low effort, high impact)**: -- Database connection pooling (2-4 hours) -- Index verification and optimization 
(1-2 hours) -- Batch PDF rendering (4-6 hours) - -**Long-term Improvements**: -- OCR result caching with hashing -- Generator patterns for streaming -- Advanced profiling and monitoring - -**Expected Impact**: -- Connection pooling: 80-95% reduction in DB overhead -- Indexes: 10-100x faster queries -- Batch rendering: 50-90% less redundant work -- **Overall**: 2-5x throughput improvement for batch processing diff --git a/docs/REFACTORING_PLAN.md b/docs/REFACTORING_PLAN.md deleted file mode 100644 index 194e0c5..0000000 --- a/docs/REFACTORING_PLAN.md +++ /dev/null @@ -1,1447 +0,0 @@ -# 重构计划文档 (Refactoring Plan) - -**项目**: Invoice Field Extraction System -**生成日期**: 2026-01-22 -**基于**: CODE_REVIEW_REPORT.md -**目标**: 提升代码可维护性、可测试性和安全性 - ---- - -## 📋 目录 - -1. [重构目标](#重构目标) -2. [总体策略](#总体策略) -3. [三阶段执行计划](#三阶段执行计划) -4. [详细重构步骤](#详细重构步骤) -5. [测试策略](#测试策略) -6. [风险管理](#风险管理) -7. [成功指标](#成功指标) - ---- - -## 🎯 重构目标 - -### 主要目标 -1. **安全性**: 消除明文密码、SQL注入等安全隐患 -2. **可维护性**: 减少代码重复,降低函数复杂度 -3. **可测试性**: 提升测试覆盖率至70%+,增加集成测试 -4. **可读性**: 统一代码风格,添加必要文档 -5. **性能**: 优化批处理和并发处理 - -### 量化指标 -- 测试覆盖率: 45% → 70%+ -- 平均函数长度: 80行 → 50行以下 -- 代码重复率: 15% → 5%以下 -- 循环复杂度: 最高15+ → 最高10 -- 关键函数文档覆盖: 30% → 80%+ - ---- - -## 📐 总体策略 - -### 原则 -1. **增量重构**: 小步快跑,每次重构保持系统可运行 -2. **测试先行**: 重构前先补充测试,确保行为不变 -3. **向后兼容**: API接口保持兼容,逐步废弃旧接口 -4. **文档同步**: 代码变更同步更新文档 - -### 工作流程 -``` -1. 为待重构模块补充测试 (确保现有行为被覆盖) - ↓ -2. 执行重构 (Extract Method, Extract Class, etc.) - ↓ -3. 运行全量测试 (确保行为不变) - ↓ -4. 更新文档 - ↓ -5. Code Review - ↓ -6. 
合并主分支 -``` - ---- - -## 🗓️ 三阶段执行计划 - -### Phase 1: 紧急修复 (1周) -**目标**: 修复安全漏洞和关键bug - -| 任务 | 优先级 | 预计时间 | 负责模块 | -|------|--------|----------|----------| -| 修复明文密码问题 | P0 | 1小时 | `src/db/config.py` | -| 配置环境变量管理 | P0 | 2小时 | 根目录 `.env` | -| 修复SQL注入风险 | P0 | 3小时 | `src/db/operations.py` | -| 添加输入验证 | P1 | 4小时 | `src/web/routes.py` | -| 异常处理规范化 | P1 | 1天 | 全局 | - -### Phase 2: 核心重构 (2-3周) -**目标**: 降低代码复杂度,消除重复 - -| 任务 | 优先级 | 预计时间 | 负责模块 | -|------|--------|----------|----------| -| 拆分 `_normalize_customer_number` | P0 | 1天 | `field_extractor.py` | -| 统一 payment_line 解析 | P0 | 2天 | 抽取到单独模块 | -| 重构 `process_document` | P1 | 2天 | `pipeline.py` | -| Extract Method: 长函数拆分 | P1 | 3天 | 全局 | -| 添加集成测试 | P0 | 3天 | `tests/integration/` | -| 提升单元测试覆盖率 | P1 | 2天 | 各模块 | - -### Phase 3: 优化改进 (1-2周) -**目标**: 性能优化、文档完善 - -| 任务 | 优先级 | 预计时间 | 负责模块 | -|------|--------|----------|----------| -| 批处理并发优化 | P1 | 2天 | `batch_processor.py` | -| API文档完善 | P2 | 1天 | `docs/API.md` | -| 配置提取到常量 | P2 | 1天 | `src/config/constants.py` | -| 日志系统优化 | P2 | 1天 | `src/utils/logging.py` | -| 性能分析和优化 | P2 | 2天 | 全局 | - ---- - -## 🔧 详细重构步骤 - -### Step 1: 修复明文密码 (P0, 1小时) - -**当前问题**: -```python -# src/db/config.py:29 -DATABASE_CONFIG = { - "host": "localhost", - "port": 3306, - "user": "root", - "password": "your_password", # ❌ 明文密码 - "database": "invoice_extraction", -} -``` - -**重构步骤**: - -1. 创建 `.env.example` 模板: -```bash -# Database Configuration -DB_HOST=localhost -DB_PORT=3306 -DB_USER=root -DB_PASSWORD=your_password_here -DB_NAME=invoice_extraction -``` - -2. 创建 `.env` 文件 (加入 `.gitignore`): -```bash -DB_PASSWORD=actual_secure_password -``` - -3. 
修改 `src/db/config.py`: -```python -import os -from dotenv import load_dotenv - -load_dotenv() - -DATABASE_CONFIG = { - "host": os.getenv("DB_HOST", "localhost"), - "port": int(os.getenv("DB_PORT", "3306")), - "user": os.getenv("DB_USER", "root"), - "password": os.getenv("DB_PASSWORD"), # ✅ 从环境变量读取 - "database": os.getenv("DB_NAME", "invoice_extraction"), -} - -# 启动时验证 -if not DATABASE_CONFIG["password"]: - raise ValueError("DB_PASSWORD environment variable not set") -``` - -4. 安装依赖: -```bash -pip install python-dotenv -``` - -5. 更新 `requirements.txt`: -``` -python-dotenv>=1.0.0 -``` - -**测试**: -- 验证环境变量读取正常 -- 确认缺少环境变量时抛出异常 -- 测试数据库连接 - ---- - -### Step 2: 修复SQL注入 (P0, 3小时) - -**当前问题**: -```python -# src/db/operations.py:156 -query = f"SELECT * FROM documents WHERE id = {doc_id}" # ❌ SQL注入风险 -cursor.execute(query) -``` - -**重构步骤**: - -1. 审查所有SQL查询,识别字符串拼接: -```bash -grep -n "f\".*SELECT" src/db/operations.py -grep -n "f\".*INSERT" src/db/operations.py -grep -n "f\".*UPDATE" src/db/operations.py -grep -n "f\".*DELETE" src/db/operations.py -``` - -2. 替换为参数化查询: -```python -# Before -query = f"SELECT * FROM documents WHERE id = {doc_id}" -cursor.execute(query) - -# After ✅ -query = "SELECT * FROM documents WHERE id = %s" -cursor.execute(query, (doc_id,)) -``` - -3. 常见场景替换: -```python -# INSERT -query = "INSERT INTO documents (filename, status) VALUES (%s, %s)" -cursor.execute(query, (filename, status)) - -# UPDATE -query = "UPDATE documents SET status = %s WHERE id = %s" -cursor.execute(query, (new_status, doc_id)) - -# IN clause -placeholders = ','.join(['%s'] * len(ids)) -query = f"SELECT * FROM documents WHERE id IN ({placeholders})" -cursor.execute(query, ids) -``` - -4. 
创建查询构建器辅助函数: -```python -# src/db/query_builder.py -def build_select(table: str, columns: list[str] = None, where: dict = None): - """Build safe SELECT query with parameters.""" - cols = ', '.join(columns) if columns else '*' - query = f"SELECT {cols} FROM {table}" - - params = [] - if where: - conditions = [] - for key, value in where.items(): - conditions.append(f"{key} = %s") - params.append(value) - query += " WHERE " + " AND ".join(conditions) - - return query, tuple(params) -``` - -**测试**: -- 单元测试所有修改的查询函数 -- SQL注入测试: 传入 `"1 OR 1=1"` 等恶意输入 -- 集成测试验证功能正常 - ---- - -### Step 3: 统一 payment_line 解析 (P0, 2天) - -**当前问题**: payment_line 解析逻辑在3个地方重复实现 -- `src/inference/field_extractor.py:632-705` (normalization) -- `src/inference/pipeline.py:217-252` (parsing for cross-validation) -- `src/inference/test_field_extractor.py:269-344` (test cases) - -**重构步骤**: - -1. 创建独立模块 `src/inference/payment_line_parser.py`: -```python -""" -Swedish Payment Line Parser - -Handles parsing and validation of Swedish machine-readable payment lines. -Format: # # <Öre> > ## -""" - -import re -from dataclasses import dataclass -from typing import Optional - - -@dataclass -class PaymentLineData: - """Parsed payment line data.""" - ocr_number: str - amount: str # Format: "KRONOR.ÖRE" - account_number: str # Bankgiro or Plusgiro - record_type: str # Usually "5" or "9" - check_digits: str - raw_text: str - is_valid: bool - error: Optional[str] = None - - -class PaymentLineParser: - """Parser for Swedish payment lines with OCR error handling.""" - - # Pattern with OCR error tolerance - FULL_PATTERN = re.compile( - r'#\s*(\d[\d\s]*)\s*#\s*([\d\s]+?)\s+(\d{2})\s+(\d)\s*>?\s*([\d\s]+)\s*#\s*(\d+)\s*#' - ) - - # Pattern without amount (fallback) - PARTIAL_PATTERN = re.compile( - r'#\s*(\d[\d\s]*)\s*#.*?(\d)\s*>?\s*([\d\s]+)\s*#\s*(\d+)\s*#' - ) - - def __init__(self): - self.logger = logging.getLogger(__name__) - - def parse(self, text: str) -> PaymentLineData: - """ - Parse payment line text. 
- - Handles common OCR errors: - - Spaces in numbers: "12 0 0" → "1200" - - Missing symbols: Missing ">" - - Spaces in check digits: "#41 #" → "#41#" - - Args: - text: Raw payment line text - - Returns: - PaymentLineData with parsed fields - """ - text = text.strip() - - # Try full pattern with amount - match = self.FULL_PATTERN.search(text) - if match: - return self._parse_full_match(match, text) - - # Try partial pattern without amount - match = self.PARTIAL_PATTERN.search(text) - if match: - return self._parse_partial_match(match, text) - - # No match - return PaymentLineData( - ocr_number="", - amount="", - account_number="", - record_type="", - check_digits="", - raw_text=text, - is_valid=False, - error="Invalid payment line format" - ) - - def _parse_full_match(self, match: re.Match, raw_text: str) -> PaymentLineData: - """Parse full pattern match (with amount).""" - ocr = self._clean_digits(match.group(1)) - kronor = self._clean_digits(match.group(2)) - ore = match.group(3) - record_type = match.group(4) - account = self._clean_digits(match.group(5)) - check_digits = match.group(6) - - amount = f"{kronor}.{ore}" - - return PaymentLineData( - ocr_number=ocr, - amount=amount, - account_number=account, - record_type=record_type, - check_digits=check_digits, - raw_text=raw_text, - is_valid=True - ) - - def _parse_partial_match(self, match: re.Match, raw_text: str) -> PaymentLineData: - """Parse partial pattern match (without amount).""" - ocr = self._clean_digits(match.group(1)) - record_type = match.group(2) - account = self._clean_digits(match.group(3)) - check_digits = match.group(4) - - return PaymentLineData( - ocr_number=ocr, - amount="", # No amount in partial format - account_number=account, - record_type=record_type, - check_digits=check_digits, - raw_text=raw_text, - is_valid=True - ) - - def _clean_digits(self, text: str) -> str: - """Remove spaces from digit string.""" - return text.replace(' ', '') - - def format_machine_readable(self, data: 
PaymentLineData) -> str: - """ - Format parsed data back to machine-readable format. - - Returns: - Formatted string: "# OCR # KRONOR ÖRE TYPE > ACCOUNT#CHECK#" - """ - if not data.is_valid: - return data.raw_text - - if data.amount: - kronor, ore = data.amount.split('.') - return ( - f"# {data.ocr_number} # {kronor} {ore} {data.record_type} > " - f"{data.account_number}#{data.check_digits}#" - ) - else: - return ( - f"# {data.ocr_number} # ... {data.record_type} > " - f"{data.account_number}#{data.check_digits}#" - ) -``` - -2. 重构 `field_extractor.py` 使用新parser: -```python -# src/inference/field_extractor.py -from .payment_line_parser import PaymentLineParser - -class FieldExtractor: - def __init__(self): - self.payment_parser = PaymentLineParser() - # ... - - def _normalize_payment_line(self, text: str) -> tuple[str | None, bool, str | None]: - """Normalize payment line using dedicated parser.""" - data = self.payment_parser.parse(text) - - if not data.is_valid: - return None, False, data.error - - formatted = self.payment_parser.format_machine_readable(data) - return formatted, True, None -``` - -3. 重构 `pipeline.py` 使用新parser: -```python -# src/inference/pipeline.py -from .payment_line_parser import PaymentLineParser - -class InferencePipeline: - def __init__(self): - self.payment_parser = PaymentLineParser() - # ... - - def _parse_machine_readable_payment_line( - self, payment_line: str - ) -> tuple[str | None, str | None, str | None]: - """Parse payment line for cross-validation.""" - data = self.payment_parser.parse(payment_line) - - if not data.is_valid: - return None, None, None - - return data.ocr_number, data.amount, data.account_number -``` - -4. 
更新测试使用新parser: -```python -# tests/unit/test_payment_line_parser.py -from src.inference.payment_line_parser import PaymentLineParser - -class TestPaymentLineParser: - def test_full_format_with_spaces(self): - """Test parsing with OCR-induced spaces.""" - parser = PaymentLineParser() - text = "# 6026726908 # 736 00 9 > 5692041 #41 #" - - data = parser.parse(text) - - assert data.is_valid - assert data.ocr_number == "6026726908" - assert data.amount == "736.00" - assert data.account_number == "5692041" - assert data.check_digits == "41" - - def test_format_without_amount(self): - """Test parsing without amount.""" - parser = PaymentLineParser() - text = "# 11000770600242 # ... 5 > 3082963#41#" - - data = parser.parse(text) - - assert data.is_valid - assert data.ocr_number == "11000770600242" - assert data.amount == "" - assert data.account_number == "3082963" - - def test_machine_readable_format(self): - """Test formatting back to machine-readable.""" - parser = PaymentLineParser() - text = "# 6026726908 # 736 00 9 > 5692041 #41 #" - - data = parser.parse(text) - formatted = parser.format_machine_readable(data) - - assert "# 6026726908 #" in formatted - assert "736 00" in formatted - assert "5692041#41#" in formatted -``` - -**迁移步骤**: -1. 创建 `payment_line_parser.py` 并添加测试 -2. 运行测试确保新实现正确 -3. 逐个文件迁移到新parser -4. 每次迁移后运行全量测试 -5. 删除旧实现代码 -6. 更新文档 - -**测试**: -- 单元测试覆盖所有解析场景 -- 集成测试验证端到端功能 -- 回归测试确保行为不变 - ---- - -### Step 4: 拆分 `_normalize_customer_number` (P0, 1天) - -**当前问题**: -- 函数长度: 127行 -- 循环复杂度: 15+ -- 职责过多: 模式匹配、格式化、验证混在一起 - -**重构策略**: Extract Method + Strategy Pattern - -**重构步骤**: - -1. 创建 `src/inference/customer_number_parser.py`: -```python -""" -Customer Number Parser - -Handles extraction and normalization of Swedish customer numbers. 
-""" - -import re -from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import Optional - - -@dataclass -class CustomerNumberMatch: - """Customer number match result.""" - value: str - pattern_name: str - confidence: float - raw_text: str - - -class CustomerNumberPattern(ABC): - """Abstract base for customer number patterns.""" - - @abstractmethod - def match(self, text: str) -> Optional[CustomerNumberMatch]: - """Try to match pattern in text.""" - pass - - @abstractmethod - def format(self, match: re.Match) -> str: - """Format matched groups to standard format.""" - pass - - -class DashFormatPattern(CustomerNumberPattern): - """Pattern: ABC 123-X""" - - PATTERN = re.compile(r'\b([A-Za-z]{2,4})\s+(\d{1,4})-([A-Za-z0-9])\b') - - def match(self, text: str) -> Optional[CustomerNumberMatch]: - match = self.PATTERN.search(text) - if not match: - return None - - formatted = self.format(match) - return CustomerNumberMatch( - value=formatted, - pattern_name="DashFormat", - confidence=0.95, - raw_text=match.group(0) - ) - - def format(self, match: re.Match) -> str: - prefix = match.group(1).upper() - number = match.group(2) - suffix = match.group(3).upper() - return f"{prefix} {number}-{suffix}" - - -class NoDashFormatPattern(CustomerNumberPattern): - """Pattern: ABC 123X (no dash)""" - - PATTERN = re.compile(r'\b([A-Za-z]{2,4})\s+(\d{2,4})([A-Za-z])\b') - - def match(self, text: str) -> Optional[CustomerNumberMatch]: - match = self.PATTERN.search(text) - if not match: - return None - - # Exclude postal codes - full_text = match.group(0) - if self._is_postal_code(full_text): - return None - - formatted = self.format(match) - return CustomerNumberMatch( - value=formatted, - pattern_name="NoDashFormat", - confidence=0.90, - raw_text=full_text - ) - - def format(self, match: re.Match) -> str: - prefix = match.group(1).upper() - number = match.group(2) - suffix = match.group(3).upper() - return f"{prefix} {number}-{suffix}" - - def 
_is_postal_code(self, text: str) -> bool: - """Check if text looks like Swedish postal code.""" - # SE 106 43, SE 10643, etc. - return bool(re.match(r'^SE\s*\d{3}\s*\d{2}', text, re.IGNORECASE)) - - -class CustomerNumberParser: - """Parser for Swedish customer numbers.""" - - def __init__(self): - # Patterns ordered by specificity (most specific first) - self.patterns: list[CustomerNumberPattern] = [ - DashFormatPattern(), - NoDashFormatPattern(), - # Add more patterns as needed - ] - self.logger = logging.getLogger(__name__) - - def parse(self, text: str) -> tuple[Optional[str], bool, Optional[str]]: - """ - Parse customer number from text. - - Returns: - (customer_number, is_valid, error) - """ - text = text.strip() - - # Try each pattern - matches: list[CustomerNumberMatch] = [] - for pattern in self.patterns: - match = pattern.match(text) - if match: - matches.append(match) - - # No matches - if not matches: - return None, False, "No customer number found" - - # Return highest confidence match - best_match = max(matches, key=lambda m: m.confidence) - return best_match.value, True, None - - def parse_all(self, text: str) -> list[CustomerNumberMatch]: - """ - Find all customer numbers in text. - - Useful for cases with multiple potential matches. - """ - matches: list[CustomerNumberMatch] = [] - for pattern in self.patterns: - match = pattern.match(text) - if match: - matches.append(match) - return sorted(matches, key=lambda m: m.confidence, reverse=True) -``` - -2. 重构 `field_extractor.py`: -```python -# src/inference/field_extractor.py -from .customer_number_parser import CustomerNumberParser - -class FieldExtractor: - def __init__(self): - self.customer_parser = CustomerNumberParser() - # ... - - def _normalize_customer_number( - self, text: str - ) -> tuple[str | None, bool, str | None]: - """Normalize customer number using dedicated parser.""" - return self.customer_parser.parse(text) -``` - -3. 
添加测试: -```python -# tests/unit/test_customer_number_parser.py -from src.inference.customer_number_parser import ( - CustomerNumberParser, - DashFormatPattern, - NoDashFormatPattern, -) - -class TestDashFormatPattern: - def test_standard_format(self): - pattern = DashFormatPattern() - match = pattern.match("Customer: JTY 576-3") - - assert match is not None - assert match.value == "JTY 576-3" - assert match.confidence == 0.95 - -class TestNoDashFormatPattern: - def test_no_dash_format(self): - pattern = NoDashFormatPattern() - match = pattern.match("Dwq 211X") - - assert match is not None - assert match.value == "DWQ 211-X" - assert match.confidence == 0.90 - - def test_exclude_postal_code(self): - pattern = NoDashFormatPattern() - match = pattern.match("SE 106 43") - - assert match is None # Should be filtered out - -class TestCustomerNumberParser: - def test_parse_with_dash(self): - parser = CustomerNumberParser() - result, is_valid, error = parser.parse("Customer: JTY 576-3") - - assert is_valid - assert result == "JTY 576-3" - assert error is None - - def test_parse_without_dash(self): - parser = CustomerNumberParser() - result, is_valid, error = parser.parse("Dwq 211X Billo") - - assert is_valid - assert result == "DWQ 211-X" - - def test_parse_all_finds_multiple(self): - parser = CustomerNumberParser() - text = "JTY 576-3 and DWQ 211X" - matches = parser.parse_all(text) - - assert len(matches) >= 1 # At least one match - assert matches[0].confidence >= 0.90 -``` - -**迁移计划**: -1. Day 1 上午: 创建新parser和测试 -2. Day 1 下午: 迁移 `field_extractor.py`,运行测试 -3. 回归测试确保所有文档处理正常 - ---- - -### Step 5: 重构 `process_document` (P1, 2天) - -**当前问题**: `pipeline.py:100-250` (150行) 职责过多 - -**重构策略**: Extract Method + 责任分离 - -**目标结构**: -```python -def process_document(self, image_path: Path, document_id: str) -> DocumentResult: - """Main orchestration - keep under 30 lines.""" - # 1. Run detection - detections = self._run_yolo_detection(image_path) - - # 2. 
Extract fields - fields = self._extract_fields_from_detections(detections, image_path) - - # 3. Apply cross-validation - fields = self._apply_cross_validation(fields) - - # 4. Multi-source fusion - fields = self._apply_multi_source_fusion(fields) - - # 5. Build result - return self._build_document_result(document_id, fields, detections) -``` - -详细步骤见 `docs/CODE_REVIEW_REPORT.md` Section 5.3. - ---- - -### Step 6: 添加集成测试 (P0, 3天) - -**当前状况**: 缺少端到端集成测试 - -**目标**: 创建完整的集成测试套件 - -**测试场景**: -1. PDF → 推理 → 结果验证 (端到端) -2. 批处理多文档 -3. API端点测试 -4. 数据库集成测试 -5. 错误场景测试 - -**实施步骤**: - -1. 创建测试数据集: -``` -tests/ -├── fixtures/ -│ ├── sample_invoices/ -│ │ ├── billo_363.pdf -│ │ ├── billo_308.pdf -│ │ └── billo_310.pdf -│ └── expected_results/ -│ ├── billo_363.json -│ ├── billo_308.json -│ └── billo_310.json -``` - -2. 创建 `tests/integration/test_end_to_end.py`: -```python -import json -import pytest -from pathlib import Path -from src.inference.pipeline import InferencePipeline -from src.inference.field_extractor import FieldExtractor - - -@pytest.fixture -def pipeline(): - """Create inference pipeline.""" - extractor = FieldExtractor() - return InferencePipeline( - model_path="runs/train/invoice_fields/weights/best.pt", - confidence_threshold=0.5, - dpi=150, - field_extractor=extractor - ) - - -@pytest.fixture -def sample_invoices(): - """Load sample invoices and expected results.""" - fixtures_dir = Path(__file__).parent.parent / "fixtures" - samples = [] - - for pdf_path in (fixtures_dir / "sample_invoices").glob("*.pdf"): - json_path = fixtures_dir / "expected_results" / f"{pdf_path.stem}.json" - - with open(json_path) as f: - expected = json.load(f) - - samples.append({ - "pdf_path": pdf_path, - "expected": expected - }) - - return samples - - -class TestEndToEnd: - """End-to-end integration tests.""" - - def test_single_document_processing(self, pipeline, sample_invoices): - """Test processing a single invoice from PDF to extracted fields.""" - sample = sample_invoices[0] - - # Process 
PDF - result = pipeline.process_pdf( - sample["pdf_path"], - document_id="test_001" - ) - - # Verify success - assert result.success - - # Verify extracted fields match expected - expected = sample["expected"] - assert result.fields["amount"] == expected["amount"] - assert result.fields["ocr_number"] == expected["ocr_number"] - assert result.fields["customer_number"] == expected["customer_number"] - - def test_batch_processing(self, pipeline, sample_invoices): - """Test batch processing multiple invoices.""" - pdf_paths = [s["pdf_path"] for s in sample_invoices] - - # Process batch - results = pipeline.process_batch(pdf_paths) - - # Verify all processed - assert len(results) == len(pdf_paths) - - # Verify success rate - success_count = sum(1 for r in results if r.success) - assert success_count >= len(pdf_paths) * 0.9 # At least 90% success - - def test_cross_validation_overrides(self, pipeline): - """Test that payment_line values override detected values.""" - # Use sample with known discrepancy (Billo310) - pdf_path = Path("tests/fixtures/sample_invoices/billo_310.pdf") - - result = pipeline.process_pdf(pdf_path, document_id="test_cross_val") - - # Verify payment_line was parsed - assert "payment_line" in result.fields - - # Verify Amount was corrected from payment_line - # (Billo310: detected 20736.00, payment_line has 736.00) - assert result.fields["amount"] == "736.00" - - def test_error_handling_invalid_pdf(self, pipeline): - """Test graceful error handling for invalid PDF.""" - invalid_pdf = Path("tests/fixtures/invalid.pdf") - - result = pipeline.process_pdf(invalid_pdf, document_id="test_error") - - # Should return result with success=False - assert not result.success - assert result.errors - assert len(result.errors) > 0 - - -class TestAPIIntegration: - """API endpoint integration tests.""" - - @pytest.fixture - def client(self): - """Create test client.""" - from fastapi.testclient import TestClient - from src.web.app import create_app - from 
src.web.config import AppConfig - - config = AppConfig.from_defaults() - app = create_app(config) - return TestClient(app) - - def test_health_endpoint(self, client): - """Test /api/v1/health endpoint.""" - response = client.get("/api/v1/health") - - assert response.status_code == 200 - data = response.json() - assert data["status"] == "healthy" - assert "model_loaded" in data - - def test_infer_endpoint_with_pdf(self, client, sample_invoices): - """Test /api/v1/infer with PDF upload.""" - sample = sample_invoices[0] - - with open(sample["pdf_path"], "rb") as f: - response = client.post( - "/api/v1/infer", - files={"file": ("test.pdf", f, "application/pdf")} - ) - - assert response.status_code == 200 - data = response.json() - assert data["status"] == "success" - assert "result" in data - assert "fields" in data["result"] - - def test_infer_endpoint_invalid_file(self, client): - """Test /api/v1/infer rejects invalid file.""" - response = client.post( - "/api/v1/infer", - files={"file": ("test.txt", b"invalid", "text/plain")} - ) - - assert response.status_code == 400 - assert "Unsupported file type" in response.json()["detail"] - - -class TestDatabaseIntegration: - """Database integration tests.""" - - @pytest.fixture - def db_connection(self): - """Create test database connection.""" - from src.db.connection import DatabaseConnection - - # Use test database - conn = DatabaseConnection(database="invoice_extraction_test") - yield conn - conn.close() - - def test_save_and_retrieve_result(self, db_connection, pipeline, sample_invoices): - """Test saving inference result to database and retrieving it.""" - sample = sample_invoices[0] - - # Process document - result = pipeline.process_pdf(sample["pdf_path"], document_id="test_db_001") - - # Save to database - db_connection.save_inference_result(result) - - # Retrieve from database - retrieved = db_connection.get_inference_result("test_db_001") - - # Verify - assert retrieved is not None - assert retrieved["document_id"] 
== "test_db_001" - assert retrieved["fields"]["amount"] == result.fields["amount"] -``` - -3. 配置 pytest 运行集成测试: -```ini -# pytest.ini -[pytest] -markers = - unit: Unit tests (fast, no external dependencies) - integration: Integration tests (slower, may use database/files) - slow: Slow tests - -# Run unit tests by default -addopts = -v -m "not integration" - -# Run all tests including integration -# pytest -m "" -# Run only integration tests -# pytest -m integration -``` - -4. CI/CD集成: -```yaml -# .github/workflows/test.yml -name: Tests - -on: [push, pull_request] - -jobs: - unit-tests: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - name: Install dependencies - run: | - pip install -r requirements.txt - pip install pytest pytest-cov - - name: Run unit tests - run: pytest -m "not integration" --cov=src --cov-report=xml - - name: Upload coverage - uses: codecov/codecov-action@v3 - with: - file: ./coverage.xml - - integration-tests: - runs-on: ubuntu-latest - services: - mysql: - image: mysql:8.0 - env: - MYSQL_ROOT_PASSWORD: test_password - MYSQL_DATABASE: invoice_extraction_test - ports: - - 3306:3306 - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - name: Install dependencies - run: | - pip install -r requirements.txt - pip install pytest - - name: Run integration tests - env: - DB_HOST: localhost - DB_PORT: 3306 - DB_USER: root - DB_PASSWORD: test_password - DB_NAME: invoice_extraction_test - run: pytest -m integration -``` - -**时间分配**: -- Day 1: 准备测试数据、创建测试框架 -- Day 2: 编写端到端和API测试 -- Day 3: 数据库集成测试、CI/CD配置 - ---- - -### Step 7: 异常处理规范化 (P1, 1天) - -**当前问题**: 31处 `except Exception` 捕获过于宽泛 - -**目标**: 创建异常层次结构,精确捕获 - -**实施步骤**: - -1. 创建 `src/exceptions.py`: -```python -""" -Application-specific exceptions. 
-""" - - -class InvoiceExtractionError(Exception): - """Base exception for invoice extraction errors.""" - pass - - -class PDFProcessingError(InvoiceExtractionError): - """Error during PDF processing.""" - pass - - -class OCRError(InvoiceExtractionError): - """Error during OCR.""" - pass - - -class ModelInferenceError(InvoiceExtractionError): - """Error during model inference.""" - pass - - -class FieldValidationError(InvoiceExtractionError): - """Error during field validation.""" - pass - - -class DatabaseError(InvoiceExtractionError): - """Error during database operations.""" - pass - - -class ConfigurationError(InvoiceExtractionError): - """Error in configuration.""" - pass -``` - -2. 替换宽泛的异常捕获: -```python -# Before ❌ -try: - result = process_pdf(path) -except Exception as e: - logger.error(f"Error: {e}") - return None - -# After ✅ -try: - result = process_pdf(path) -except PDFProcessingError as e: - logger.error(f"PDF processing failed: {e}") - return None -except OCRError as e: - logger.warning(f"OCR failed, trying fallback: {e}") - result = fallback_ocr(path) -except ModelInferenceError as e: - logger.error(f"Model inference failed: {e}") - raise # Re-raise for upper layer -``` - -3. 在各模块中抛出具体异常: -```python -# src/inference/pdf_processor.py -from src.exceptions import PDFProcessingError - -def convert_pdf_to_image(pdf_path: Path, dpi: int) -> list[np.ndarray]: - try: - images = pdf2image.convert_from_path(pdf_path, dpi=dpi) - except Exception as e: - raise PDFProcessingError(f"Failed to convert PDF: {e}") from e - - if not images: - raise PDFProcessingError("PDF conversion returned no images") - - return images -``` - -4. 
创建异常处理装饰器: -```python -# src/utils/error_handling.py -import functools -from typing import Callable, Type -from src.exceptions import InvoiceExtractionError - - -def handle_errors( - *exception_types: Type[Exception], - default_return=None, - log_error: bool = True -): - """Decorator for standardized error handling.""" - def decorator(func: Callable): - @functools.wraps(func) - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except exception_types as e: - if log_error: - logger = logging.getLogger(func.__module__) - logger.error( - f"Error in {func.__name__}: {e}", - exc_info=True - ) - return default_return - return wrapper - return decorator - - -# Usage -@handle_errors(PDFProcessingError, OCRError, default_return=None) -def safe_process_document(doc_path: Path): - return process_document(doc_path) -``` - ---- - -### Step 8-12: 其他重构任务 - -详细步骤参见 `CODE_REVIEW_REPORT.md` Section 6 (Action Plan)。 - ---- - -## 🧪 测试策略 - -### 测试金字塔 - -``` - /\ - / \ E2E Tests (10%) - /----\ - Full pipeline tests - / \ - API integration tests - /--------\ - / \ Integration Tests (30%) - /------------\ - Module integration -/ \ - Database tests ----------------- - Unit Tests (60%) - - Function-level tests - - High coverage -``` - -### 测试覆盖率目标 - -| 模块 | 当前覆盖率 | 目标覆盖率 | -|------|-----------|-----------| -| `field_extractor.py` | 40% | 80% | -| `pipeline.py` | 50% | 75% | -| `payment_line_parser.py` | 0% (新) | 90% | -| `customer_number_parser.py` | 0% (新) | 90% | -| `web/routes.py` | 30% | 70% | -| `db/operations.py` | 20% | 60% | -| **Overall** | **45%** | **70%+** | - -### 回归测试 - -每次重构后必须运行: - -```bash -# 1. 单元测试 -pytest tests/unit/ -v - -# 2. 集成测试 -pytest tests/integration/ -v - -# 3. 端到端测试(使用实际PDF) -pytest tests/e2e/ -v - -# 4. 性能测试(确保没有退化) -pytest tests/performance/ -v --benchmark - -# 5. 
测试覆盖率检查 -pytest --cov=src --cov-report=html --cov-fail-under=70 -``` - ---- - -## ⚠️ 风险管理 - -### 识别的风险 - -| 风险 | 影响 | 概率 | 缓解措施 | -|------|------|------|---------| -| 重构破坏现有功能 | 高 | 中 | 1. 重构前补充测试
2. 小步迭代
3. 回归测试 | -| 性能退化 | 中 | 低 | 1. 性能基准测试
2. 持续监控
3. Profile优化 | -| API接口变更影响客户端 | 高 | 低 | 1. 语义化版本控制
2. 废弃通知期
3. 向后兼容 | -| 数据库迁移失败 | 高 | 低 | 1. 备份数据
2. 分阶段迁移
3. 回滚计划 | -| 时间超期 | 中 | 中 | 1. 优先级排序
2. 每周进度审查
3. 必要时调整范围 | - -### 回滚计划 - -每个重构步骤都应有明确的回滚策略: - -1. **代码回滚**: 使用Git分支隔离变更 - ```bash - # 每个重构任务创建特性分支 - git checkout -b refactor/payment-line-parser - - # 如需回滚 - git checkout main - git branch -D refactor/payment-line-parser - ``` - -2. **数据库回滚**: 使用数据库迁移工具 - ```bash - # 应用迁移 - alembic upgrade head - - # 回滚迁移 - alembic downgrade -1 - ``` - -3. **配置回滚**: 保留旧配置兼容性 - ```python - # 支持新旧两种配置格式 - password = config.get("db_password") or config.get("password") - ``` - ---- - -## 📊 成功指标 - -### 量化指标 - -| 指标 | 当前值 | 目标值 | 测量方法 | -|------|--------|--------|---------| -| 测试覆盖率 | 45% | 70%+ | `pytest --cov` | -| 平均函数长度 | 80行 | <50行 | `radon cc` | -| 循环复杂度 | 最高15+ | <10 | `radon cc` | -| 代码重复率 | ~15% | <5% | `pylint --duplicate` | -| 安全问题 | 2个 (明文密码, SQL注入) | 0个 | 手动审查 + `bandit` | -| 文档覆盖率 | 30% | 80%+ | 手动审查 | -| 平均处理时间 | ~2秒/文档 | <2秒/文档 | 性能测试 | - -### 质量门禁 - -所有变更必须满足: -- ✅ 测试覆盖率 ≥ 70% -- ✅ 所有测试通过 (单元 + 集成 + E2E) -- ✅ 无高危安全问题 -- ✅ 代码审查通过 -- ✅ 性能无退化 (±5%以内) -- ✅ 文档已更新 - ---- - -## 📅 时间表 - -### Phase 1: 紧急修复 (Week 1) - -| 日期 | 任务 | 负责人 | 状态 | -|------|------|--------|------| -| Day 1 | 修复明文密码 + 环境变量配置 | | ⏳ | -| Day 2-3 | 修复SQL注入 + 添加参数化查询 | | ⏳ | -| Day 4-5 | 异常处理规范化 | | ⏳ | - -### Phase 2: 核心重构 (Week 2-4) - -| 周 | 任务 | 状态 | -|----|------|------| -| Week 2 | 统一payment_line解析 + 拆分customer_number | ⏳ | -| Week 3 | 重构pipeline + Extract Method | ⏳ | -| Week 4 | 添加集成测试 + 提升单元测试覆盖率 | ⏳ | - -### Phase 3: 优化改进 (Week 5-6) - -| 周 | 任务 | 状态 | -|----|------|------| -| Week 5 | 批处理优化 + 配置提取 | ⏳ | -| Week 6 | 文档完善 + 日志优化 + 性能调优 | ⏳ | - ---- - -## 🔄 持续改进 - -### Code Review Checklist - -每次提交前检查: -- [ ] 所有测试通过 -- [ ] 测试覆盖率达标 -- [ ] 无新增安全问题 -- [ ] 代码符合风格指南 -- [ ] 函数长度 < 50行 -- [ ] 循环复杂度 < 10 -- [ ] 文档已更新 -- [ ] 变更日志已记录 - -### 自动化工具 - -配置pre-commit hooks: -```yaml -# .pre-commit-config.yaml -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-yaml - - id: check-added-large-files - - - repo: 
https://github.com/psf/black - rev: 23.3.0 - hooks: - - id: black - language_version: python3.11 - - - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 - hooks: - - id: flake8 - args: [--max-line-length=88, --extend-ignore=E203] - - - repo: https://github.com/PyCQA/bandit - rev: 1.7.5 - hooks: - - id: bandit - args: [-c, pyproject.toml] - - - repo: local - hooks: - - id: pytest-check - name: pytest-check - entry: pytest - language: system - pass_filenames: false - always_run: true - args: [-m, "not integration", --tb=short] -``` - ---- - -## 📚 参考资料 - -### 重构书籍 -- *Refactoring: Improving the Design of Existing Code* - Martin Fowler -- *Clean Code* - Robert C. Martin -- *Working Effectively with Legacy Code* - Michael Feathers - -### 设计模式 -- Strategy Pattern (customer_number patterns) -- Factory Pattern (parser creation) -- Template Method (field normalization) - -### Python最佳实践 -- PEP 8: Style Guide -- PEP 257: Docstring Conventions -- Google Python Style Guide - ---- - -## ✅ 验收标准 - -重构完成的定义: -1. ✅ 所有P0和P1任务完成 -2. ✅ 测试覆盖率 ≥ 70% -3. ✅ 安全问题全部修复 -4. ✅ 代码重复率 < 5% -5. ✅ 所有长函数 (>100行) 已拆分 -6. ✅ API文档完整 -7. ✅ 性能无退化 -8. ✅ 生产环境部署成功 - ---- - -**文档结束** - -下一步: 开始执行 Phase 1, Day 1 - 修复明文密码问题 diff --git a/docs/REFACTORING_SUMMARY.md b/docs/REFACTORING_SUMMARY.md deleted file mode 100644 index 06b5937..0000000 --- a/docs/REFACTORING_SUMMARY.md +++ /dev/null @@ -1,170 +0,0 @@ -# 代码重构总结报告 - -## 📊 整体成果 - -### 测试状态 -- ✅ **688/688 测试全部通过** (100%) -- ✅ **代码覆盖率**: 34% → 37% (+3%) -- ✅ **0 个失败**, 0 个错误 - -### 测试覆盖率改进 -- ✅ **machine_code_parser**: 25% → 65% (+40%) -- ✅ **新增测试**: 55个(633 → 688) - ---- - -## 🎯 已完成的重构 - -### 1. ✅ Matcher 模块化 (876行 → 205行, ↓76%) - -**文件**: - -**重构内容**: -- 将单一876行文件拆分为 **11个模块** -- 提取 **5种独立的匹配策略** -- 创建专门的数据模型、工具函数和上下文处理模块 - -**新模块结构**: - - -**测试结果**: -- ✅ 77个 matcher 测试全部通过 -- ✅ 完整的README文档 -- ✅ 策略模式,易于扩展 - -**收益**: -- 📉 代码量减少 76% -- 📈 可维护性显著提高 -- ✨ 每个策略独立测试 -- 🔧 易于添加新策略 - ---- - -### 2. 
✅ Machine Code Parser 轻度重构 + 测试覆盖 (919行 → 929行) - -**文件**: src/ocr/machine_code_parser.py - -**重构内容**: -- 提取 **3个共享辅助方法**,消除重复代码 -- 优化上下文检测逻辑 -- 简化账号格式化方法 - -**测试改进**: -- ✅ **新增55个测试**(24 → 79个) -- ✅ **覆盖率**: 25% → 65% (+40%) -- ✅ 所有688个项目测试通过 - -**新增测试覆盖**: -- **第一轮** (22个测试): - - `_detect_account_context()` - 8个测试(上下文检测) - - `_normalize_account_spaces()` - 5个测试(空格规范化) - - `_format_account()` - 4个测试(账号格式化) - - `parse()` - 5个测试(主入口方法) -- **第二轮** (33个测试): - - `_extract_ocr()` - 8个测试(OCR 提取) - - `_extract_bankgiro()` - 9个测试(Bankgiro 提取) - - `_extract_plusgiro()` - 8个测试(Plusgiro 提取) - - `_extract_amount()` - 8个测试(金额提取) - -**收益**: -- 🔄 消除80行重复代码 -- 📈 可测试性提高(可独立测试辅助方法) -- 📖 代码可读性提升 -- ✅ 覆盖率从25%提升到65% (+40%) -- 🎯 低风险,高回报 - ---- - -### 3. ✅ Field Extractor 分析 (决定不重构) - -**文件**: (1183行) - -**分析结果**: ❌ **不应重构** - -**关键洞察**: -- 表面相似的代码可能有**完全不同的用途** -- field_extractor: **解析/提取** 字段值 -- src/normalize: **标准化/生成变体** 用于匹配 -- 两者职责不同,不应统一 - -**文档**: - ---- - -## 📈 重构统计 - -### 代码行数变化 - -| 文件 | 重构前 | 重构后 | 变化 | 百分比 | -|------|--------|--------|------|--------| -| **matcher/field_matcher.py** | 876行 | 205行 | -671 | ↓76% | -| **matcher/* (新增10个模块)** | 0行 | 466行 | +466 | 新增 | -| **matcher 总计** | 876行 | 671行 | -205 | ↓23% | -| **ocr/machine_code_parser.py** | 919行 | 929行 | +10 | +1% | -| **总净减少** | - | - | **-195行** | **↓11%** | - -### 测试覆盖 - -| 模块 | 测试数 | 通过率 | 覆盖率 | 状态 | -|------|--------|--------|--------|------| -| matcher | 77 | 100% | - | ✅ | -| field_extractor | 45 | 100% | 39% | ✅ | -| machine_code_parser | 79 | 100% | 65% | ✅ | -| normalizer | ~120 | 100% | - | ✅ | -| 其他模块 | ~367 | 100% | - | ✅ | -| **总计** | **688** | **100%** | **37%** | ✅ | - ---- - -## 🎓 重构经验总结 - -### 成功经验 - -1. **✅ 先测试后重构** - - 所有重构都有完整测试覆盖 - - 每次改动后立即验证测试 - - 100%测试通过率保证质量 - -2. **✅ 识别真正的重复** - - 不是所有相似代码都是重复 - - field_extractor vs normalizer: 表面相似但用途不同 - - machine_code_parser: 真正的代码重复 - -3. 
**✅ 渐进式重构** - - matcher: 大规模模块化 (策略模式) - - machine_code_parser: 轻度重构 (提取共享方法) - - field_extractor: 分析后决定不重构 - -### 关键决策 - -#### ✅ 应该重构的情况 -- **matcher**: 单一文件过长 (876行),包含多种策略 -- **machine_code_parser**: 多处相同用途的重复代码 - -#### ❌ 不应重构的情况 -- **field_extractor**: 相似代码有不同用途 - -### 教训 - -**不要盲目追求DRY原则** -> 相似代码不一定是重复。要理解代码的**真实用途**。 - ---- - -## ✅ 总结 - -**关键成果**: -- 📉 净减少 195 行代码 -- 📈 代码覆盖率 +3% (34% → 37%) -- ✅ 测试数量 +55 (633 → 688) -- 🎯 machine_code_parser 覆盖率 +40% (25% → 65%) -- ✨ 模块化程度显著提高 -- 🎯 可维护性大幅提升 - -**重要教训**: -> 相似的代码不一定是重复的代码。理解代码的真实用途,才能做出正确的重构决策。 - -**下一步建议**: -1. 继续提升 machine_code_parser 覆盖率到 80%+ (目前 65%) -2. 为其他低覆盖模块添加测试(field_extractor 39%, pipeline 19%) -3. 完善边界条件和异常情况的测试 diff --git a/docs/TEST_COVERAGE_IMPROVEMENT.md b/docs/TEST_COVERAGE_IMPROVEMENT.md deleted file mode 100644 index 15d3487..0000000 --- a/docs/TEST_COVERAGE_IMPROVEMENT.md +++ /dev/null @@ -1,258 +0,0 @@ -# 测试覆盖率改进报告 - -## 📊 改进概览 - -### 整体统计 -- ✅ **测试总数**: 633 → 688 (+55个测试, +8.7%) -- ✅ **通过率**: 100% (688/688) -- ✅ **整体覆盖率**: 34% → 37% (+3%) - -### machine_code_parser.py 专项改进 -- ✅ **测试数**: 24 → 79 (+55个测试, +229%) -- ✅ **覆盖率**: 25% → 65% (+40%) -- ✅ **未覆盖行**: 273 → 129 (减少144行) - ---- - -## 🎯 新增测试详情 - -### 第一轮改进 (22个测试) - -#### 1. TestDetectAccountContext (8个测试) - -测试新增的 `_detect_account_context()` 辅助方法。 - -**测试用例**: -1. `test_bankgiro_keyword` - 检测 'bankgiro' 关键词 -2. `test_bg_keyword` - 检测 'bg:' 缩写 -3. `test_plusgiro_keyword` - 检测 'plusgiro' 关键词 -4. `test_postgiro_keyword` - 检测 'postgiro' 别名 -5. `test_pg_keyword` - 检测 'pg:' 缩写 -6. `test_both_contexts` - 同时存在两种关键词 -7. `test_no_context` - 无账号关键词 -8. 
`test_case_insensitive` - 大小写不敏感检测 - -**覆盖的代码路径**: -```python -def _detect_account_context(self, tokens: list[TextToken]) -> dict[str, bool]: - context_text = ' '.join(t.text.lower() for t in tokens) - return { - 'bankgiro': any(kw in context_text for kw in ['bankgiro', 'bg:', 'bg ']), - 'plusgiro': any(kw in context_text for kw in ['plusgiro', 'postgiro', 'plusgirokonto', 'pg:', 'pg ']), - } -``` - ---- - -### 2. TestNormalizeAccountSpacesMethod (5个测试) - -测试新增的 `_normalize_account_spaces()` 辅助方法。 - -**测试用例**: -1. `test_removes_spaces_after_arrow` - 移除 > 后的空格 -2. `test_multiple_consecutive_spaces` - 处理多个连续空格 -3. `test_no_arrow_returns_unchanged` - 无 > 标记时返回原值 -4. `test_spaces_before_arrow_preserved` - 保留 > 前的空格 -5. `test_empty_string` - 空字符串处理 - -**覆盖的代码路径**: -```python -def _normalize_account_spaces(self, line: str) -> str: - if '>' not in line: - return line - parts = line.split('>', 1) - after_arrow = parts[1] - normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', after_arrow) - while re.search(r'(\d)\s+(\d)', normalized): - normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', normalized) - return parts[0] + '>' + normalized -``` - ---- - -### 3. TestFormatAccount (4个测试) - -测试新增的 `_format_account()` 辅助方法。 - -**测试用例**: -1. `test_plusgiro_context_forces_plusgiro` - Plusgiro 上下文强制格式化为 Plusgiro -2. `test_valid_bankgiro_7_digits` - 7位有效 Bankgiro 格式化 -3. `test_valid_bankgiro_8_digits` - 8位有效 Bankgiro 格式化 -4. 
`test_defaults_to_bankgiro_when_ambiguous` - 模糊情况默认 Bankgiro - -**覆盖的代码路径**: -```python -def _format_account(self, account_digits: str, is_plusgiro_context: bool) -> tuple[str, str]: - if is_plusgiro_context: - formatted = f"{account_digits[:-1]}-{account_digits[-1]}" - return formatted, 'plusgiro' - - # Luhn 验证逻辑 - pg_valid = FieldValidators.is_valid_plusgiro(account_digits) - bg_valid = FieldValidators.is_valid_bankgiro(account_digits) - - # 决策逻辑 - if pg_valid and not bg_valid: - return pg_formatted, 'plusgiro' - elif bg_valid and not pg_valid: - return bg_formatted, 'bankgiro' - else: - return bg_formatted, 'bankgiro' -``` - ---- - -### 4. TestParseMethod (5个测试) - -测试主入口 `parse()` 方法。 - -**测试用例**: -1. `test_parse_empty_tokens` - 空 token 列表处理 -2. `test_parse_finds_payment_line_in_bottom_region` - 在页面底部35%区域查找付款行 -3. `test_parse_ignores_top_region` - 忽略页面顶部区域 -4. `test_parse_with_context_keywords` - 检测上下文关键词 -5. `test_parse_stores_source_tokens` - 存储源 token - -**覆盖的代码路径**: -- Token 过滤(底部区域检测) -- 上下文关键词检测 -- 付款行查找和解析 -- 结果对象构建 - ---- - -### 第二轮改进 (33个测试) - -#### 5. TestExtractOCR (8个测试) - -测试 `_extract_ocr()` 方法 - OCR 参考号码提取。 - -**测试用例**: -1. `test_extract_valid_ocr_10_digits` - 提取10位 OCR 号码 -2. `test_extract_valid_ocr_15_digits` - 提取15位 OCR 号码 -3. `test_extract_ocr_with_hash_markers` - 带 # 标记的 OCR -4. `test_extract_longest_ocr_when_multiple` - 多个候选时选最长 -5. `test_extract_ocr_ignores_short_numbers` - 忽略短于10位的数字 -6. `test_extract_ocr_ignores_long_numbers` - 忽略长于25位的数字 -7. `test_extract_ocr_excludes_bankgiro_variants` - 排除 Bankgiro 变体 -8. `test_extract_ocr_empty_tokens` - 空 token 处理 - -#### 6. TestExtractBankgiro (9个测试) - -测试 `_extract_bankgiro()` 方法 - Bankgiro 账号提取。 - -**测试用例**: -1. `test_extract_bankgiro_7_digits_with_dash` - 带破折号的7位 Bankgiro -2. `test_extract_bankgiro_7_digits_without_dash` - 无破折号的7位 Bankgiro -3. `test_extract_bankgiro_8_digits_with_dash` - 带破折号的8位 Bankgiro -4. `test_extract_bankgiro_8_digits_without_dash` - 无破折号的8位 Bankgiro -5. 
`test_extract_bankgiro_with_spaces` - 带空格的 Bankgiro -6. `test_extract_bankgiro_handles_plusgiro_format` - 处理 Plusgiro 格式 -7. `test_extract_bankgiro_with_context` - 带上下文关键词 -8. `test_extract_bankgiro_ignores_plusgiro_context` - 忽略 Plusgiro 上下文 -9. `test_extract_bankgiro_empty_tokens` - 空 token 处理 - -#### 7. TestExtractPlusgiro (8个测试) - -测试 `_extract_plusgiro()` 方法 - Plusgiro 账号提取。 - -**测试用例**: -1. `test_extract_plusgiro_7_digits_with_dash` - 带破折号的7位 Plusgiro -2. `test_extract_plusgiro_7_digits_without_dash` - 无破折号的7位 Plusgiro -3. `test_extract_plusgiro_8_digits` - 8位 Plusgiro -4. `test_extract_plusgiro_with_spaces` - 带空格的 Plusgiro -5. `test_extract_plusgiro_with_context` - 带上下文关键词 -6. `test_extract_plusgiro_ignores_too_short` - 忽略少于7位 -7. `test_extract_plusgiro_ignores_too_long` - 忽略多于8位 -8. `test_extract_plusgiro_empty_tokens` - 空 token 处理 - -#### 8. TestExtractAmount (8个测试) - -测试 `_extract_amount()` 方法 - 金额提取。 - -**测试用例**: -1. `test_extract_amount_with_comma_decimal` - 逗号小数分隔符 -2. `test_extract_amount_with_dot_decimal` - 点号小数分隔符 -3. `test_extract_amount_integer` - 整数金额 -4. `test_extract_amount_with_thousand_separator` - 千位分隔符 -5. `test_extract_amount_large_number` - 大额金额 -6. `test_extract_amount_ignores_too_large` - 忽略过大金额 -7. `test_extract_amount_ignores_zero` - 忽略零或负数 -8. 
`test_extract_amount_empty_tokens` - 空 token 处理 - ---- - -## 📈 覆盖率分析 - -### 已覆盖的方法 -✅ `_detect_account_context()` - **100%** (第一轮新增) -✅ `_normalize_account_spaces()` - **100%** (第一轮新增) -✅ `_format_account()` - **95%** (第一轮新增) -✅ `parse()` - **70%** (第一轮改进) -✅ `_parse_standard_payment_line()` - **95%** (已有测试) -✅ `_extract_ocr()` - **85%** (第二轮新增) -✅ `_extract_bankgiro()` - **90%** (第二轮新增) -✅ `_extract_plusgiro()` - **90%** (第二轮新增) -✅ `_extract_amount()` - **80%** (第二轮新增) - -### 仍需改进的方法 (未覆盖/部分覆盖) -⚠️ `_calculate_confidence()` - **0%** (未测试) -⚠️ `cross_validate()` - **0%** (未测试) -⚠️ `get_region_bbox()` - **0%** (未测试) -⚠️ `_find_tokens_with_values()` - **部分覆盖** -⚠️ `_find_machine_code_line_tokens()` - **部分覆盖** - -### 未覆盖的代码行(129行) -主要集中在: -1. **验证方法** (lines 805-824): `_calculate_confidence`, `cross_validate` -2. **辅助方法** (lines 80-92, 336-369, 377-407): Token 查找、bbox 计算、日志记录 -3. **边界条件** (lines 648-653, 690, 699, 759-760等): 某些提取方法的边界情况 - ---- - -## 🎯 改进建议 - -### ✅ 已完成目标 -- ✅ 覆盖率从 25% 提升到 65% (+40%) -- ✅ 测试数量从 24 增加到 79 (+55个) -- ✅ 提取方法全部测试(_extract_ocr, _extract_bankgiro, _extract_plusgiro, _extract_amount) - -### 下一步目标(覆盖率 65% → 80%+) -1. **添加验证方法测试** - 为 `_calculate_confidence`, `cross_validate` 添加测试 -2. **添加辅助方法测试** - 为 token 查找和 bbox 计算方法添加测试 -3. **完善边界条件** - 增加边界情况和异常处理的测试 -4. **集成测试** - 添加端到端的集成测试,使用真实 PDF token 数据 - ---- - -## ✅ 已完成的改进 - -### 重构收益 -- ✅ 提取的3个辅助方法现在可以独立测试 -- ✅ 测试粒度更细,更容易定位问题 -- ✅ 代码可读性提高,测试用例清晰易懂 - -### 质量保证 -- ✅ 所有688个测试100%通过 -- ✅ 无回归问题 -- ✅ 新增测试覆盖了之前未测试的重构代码 - ---- - -## 📚 测试编写经验 - -### 成功经验 -1. **使用 fixture 创建测试数据** - `_create_token()` 辅助方法简化了 token 创建 -2. **按方法组织测试类** - 每个方法一个测试类,结构清晰 -3. **测试用例命名清晰** - `test_<功能>_<场景>` 格式,一目了然 -4. **覆盖关键路径** - 优先测试常见场景和边界条件 - -### 遇到的问题 -1. 
**Token 初始化参数** - 忘记了 `page_no` 参数,导致初始测试失败 - - 解决:修复 `_create_token()` 辅助方法,添加 `page_no=0` - ---- - -**报告日期**: 2026-01-24 -**状态**: ✅ 完成 -**下一步**: 继续提升覆盖率到 80%+ diff --git a/docs/multi_pool_design.md b/docs/multi_pool_design.md deleted file mode 100644 index 6cbce39..0000000 --- a/docs/multi_pool_design.md +++ /dev/null @@ -1,619 +0,0 @@ -# 多池处理架构设计文档 - -## 1. 研究总结 - -### 1.1 当前问题分析 - -我们之前实现的双池模式存在稳定性问题,主要原因: - -| 问题 | 原因 | 解决方案 | -|------|------|----------| -| 处理卡住 | 线程 + ProcessPoolExecutor 混用导致死锁 | 使用 asyncio 或纯 Queue 模式 | -| Queue.get() 无限阻塞 | 没有超时机制 | 添加 timeout 和哨兵值 | -| GPU 内存冲突 | 多进程同时访问 GPU | 限制 GPU worker = 1 | -| CUDA fork 问题 | Linux 默认 fork 不兼容 CUDA | 使用 spawn 启动方式 | - -### 1.2 推荐架构方案 - -经过研究,最适合我们场景的方案是 **生产者-消费者队列模式**: - -``` -┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ Main Process │ │ CPU Workers │ │ GPU Worker │ -│ │ │ (4 processes) │ │ (1 process) │ -│ ┌───────────┐ │ │ │ │ │ -│ │ Task │──┼────▶│ Text PDF处理 │ │ Scanned PDF处理 │ -│ │ Dispatcher│ │ │ (无需OCR) │ │ (PaddleOCR) │ -│ └───────────┘ │ │ │ │ │ -│ ▲ │ │ │ │ │ │ │ -│ │ │ │ ▼ │ │ ▼ │ -│ ┌───────────┐ │ │ Result Queue │ │ Result Queue │ -│ │ Result │◀─┼─────│◀────────────────│─────│◀────────────────│ -│ │ Collector │ │ │ │ │ │ -│ └───────────┘ │ └─────────────────┘ └─────────────────┘ -│ │ │ -│ ▼ │ -│ ┌───────────┐ │ -│ │ Database │ │ -│ │ Batch │ │ -│ │ Writer │ │ -│ └───────────┘ │ -└─────────────────┘ -``` - ---- - -## 2. 核心设计原则 - -### 2.1 CUDA 兼容性 - -```python -# 关键:使用 spawn 启动方式 -import multiprocessing as mp -ctx = mp.get_context("spawn") - -# GPU worker 初始化时设置设备 -def init_gpu_worker(gpu_id: int = 0): - os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - global _ocr - from paddleocr import PaddleOCR - _ocr = PaddleOCR(use_gpu=True, ...) 
-``` - -### 2.2 Worker 初始化模式 - -使用 `initializer` 参数一次性加载模型,避免每个任务重新加载: - -```python -# 全局变量保存模型 -_ocr = None - -def init_worker(use_gpu: bool, gpu_id: int = 0): - global _ocr - if use_gpu: - os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - else: - os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - - from paddleocr import PaddleOCR - _ocr = PaddleOCR(use_gpu=use_gpu, ...) - -# 创建 Pool 时使用 initializer -pool = ProcessPoolExecutor( - max_workers=1, - initializer=init_worker, - initargs=(True, 0), # use_gpu=True, gpu_id=0 - mp_context=mp.get_context("spawn") -) -``` - -### 2.3 队列模式 vs as_completed - -| 方式 | 优点 | 缺点 | 适用场景 | -|------|------|------|----------| -| `as_completed()` | 简单、无需管理队列 | 无法跨多个 Pool 使用 | 单池场景 | -| `multiprocessing.Queue` | 高性能、灵活 | 需要手动管理、死锁风险 | 多池流水线 | -| `Manager().Queue()` | 可 pickle、跨 Pool | 性能较低 | 需要 Pool.map 场景 | - -**推荐**:对于双池场景,使用 `as_completed()` 分别处理每个池,然后合并结果。 - ---- - -## 3. 详细开发计划 - -### 阶段 1:重构基础架构 (2-3天) - -#### 1.1 创建 WorkerPool 抽象类 - -```python -# src/processing/worker_pool.py - -from __future__ import annotations -from abc import ABC, abstractmethod -from concurrent.futures import ProcessPoolExecutor, Future -from dataclasses import dataclass -from typing import List, Any, Optional, Callable -import multiprocessing as mp - -@dataclass -class TaskResult: - """任务结果容器""" - task_id: str - success: bool - data: Any - error: Optional[str] = None - processing_time: float = 0.0 - -class WorkerPool(ABC): - """Worker Pool 抽象基类""" - - def __init__(self, max_workers: int, use_gpu: bool = False, gpu_id: int = 0): - self.max_workers = max_workers - self.use_gpu = use_gpu - self.gpu_id = gpu_id - self._executor: Optional[ProcessPoolExecutor] = None - - @abstractmethod - def get_initializer(self) -> Callable: - """返回 worker 初始化函数""" - pass - - @abstractmethod - def get_init_args(self) -> tuple: - """返回初始化参数""" - pass - - def start(self): - """启动 worker pool""" - ctx = mp.get_context("spawn") - self._executor = ProcessPoolExecutor( - 
max_workers=self.max_workers, - mp_context=ctx, - initializer=self.get_initializer(), - initargs=self.get_init_args() - ) - - def submit(self, fn: Callable, *args, **kwargs) -> Future: - """提交任务""" - if not self._executor: - raise RuntimeError("Pool not started") - return self._executor.submit(fn, *args, **kwargs) - - def shutdown(self, wait: bool = True): - """关闭 pool""" - if self._executor: - self._executor.shutdown(wait=wait) - self._executor = None - - def __enter__(self): - self.start() - return self - - def __exit__(self, *args): - self.shutdown() -``` - -#### 1.2 实现 CPU 和 GPU Worker Pool - -```python -# src/processing/cpu_pool.py - -class CPUWorkerPool(WorkerPool): - """CPU-only worker pool for text PDF processing""" - - def __init__(self, max_workers: int = 4): - super().__init__(max_workers=max_workers, use_gpu=False) - - def get_initializer(self) -> Callable: - return init_cpu_worker - - def get_init_args(self) -> tuple: - return () - -# src/processing/gpu_pool.py - -class GPUWorkerPool(WorkerPool): - """GPU worker pool for OCR processing""" - - def __init__(self, max_workers: int = 1, gpu_id: int = 0): - super().__init__(max_workers=max_workers, use_gpu=True, gpu_id=gpu_id) - - def get_initializer(self) -> Callable: - return init_gpu_worker - - def get_init_args(self) -> tuple: - return (self.gpu_id,) -``` - ---- - -### 阶段 2:实现双池协调器 (2-3天) - -#### 2.1 任务分发器 - -```python -# src/processing/task_dispatcher.py - -from dataclasses import dataclass -from enum import Enum, auto -from typing import List, Tuple - -class TaskType(Enum): - CPU = auto() # Text PDF - GPU = auto() # Scanned PDF - -@dataclass -class Task: - id: str - task_type: TaskType - data: Any - -class TaskDispatcher: - """根据 PDF 类型分发任务到不同的 pool""" - - def classify_task(self, doc_info: dict) -> TaskType: - """判断文档是否需要 OCR""" - # 基于 PDF 特征判断 - if self._is_scanned_pdf(doc_info): - return TaskType.GPU - return TaskType.CPU - - def _is_scanned_pdf(self, doc_info: dict) -> bool: - """检测是否为扫描件""" - # 1. 
检查是否有可提取文本 - # 2. 检查图片比例 - # 3. 检查文本密度 - pass - - def partition_tasks(self, tasks: List[Task]) -> Tuple[List[Task], List[Task]]: - """将任务分为 CPU 和 GPU 两组""" - cpu_tasks = [t for t in tasks if t.task_type == TaskType.CPU] - gpu_tasks = [t for t in tasks if t.task_type == TaskType.GPU] - return cpu_tasks, gpu_tasks -``` - -#### 2.2 双池协调器 - -```python -# src/processing/dual_pool_coordinator.py - -from concurrent.futures import as_completed -from typing import List, Iterator -import logging - -logger = logging.getLogger(__name__) - -class DualPoolCoordinator: - """协调 CPU 和 GPU 两个 worker pool""" - - def __init__( - self, - cpu_workers: int = 4, - gpu_workers: int = 1, - gpu_id: int = 0 - ): - self.cpu_pool = CPUWorkerPool(max_workers=cpu_workers) - self.gpu_pool = GPUWorkerPool(max_workers=gpu_workers, gpu_id=gpu_id) - self.dispatcher = TaskDispatcher() - - def __enter__(self): - self.cpu_pool.start() - self.gpu_pool.start() - return self - - def __exit__(self, *args): - self.cpu_pool.shutdown() - self.gpu_pool.shutdown() - - def process_batch( - self, - documents: List[dict], - cpu_task_fn: Callable, - gpu_task_fn: Callable, - on_result: Optional[Callable[[TaskResult], None]] = None, - on_error: Optional[Callable[[str, Exception], None]] = None - ) -> List[TaskResult]: - """ - 处理一批文档,自动分发到 CPU 或 GPU pool - - Args: - documents: 待处理文档列表 - cpu_task_fn: CPU 任务处理函数 - gpu_task_fn: GPU 任务处理函数 - on_result: 结果回调(可选) - on_error: 错误回调(可选) - - Returns: - 所有任务结果列表 - """ - # 分类任务 - tasks = [ - Task(id=doc['id'], task_type=self.dispatcher.classify_task(doc), data=doc) - for doc in documents - ] - cpu_tasks, gpu_tasks = self.dispatcher.partition_tasks(tasks) - - logger.info(f"Task partition: {len(cpu_tasks)} CPU, {len(gpu_tasks)} GPU") - - # 提交任务到各自的 pool - cpu_futures = { - self.cpu_pool.submit(cpu_task_fn, t.data): t.id - for t in cpu_tasks - } - gpu_futures = { - self.gpu_pool.submit(gpu_task_fn, t.data): t.id - for t in gpu_tasks - } - - # 收集结果 - results = [] - all_futures = 
list(cpu_futures.keys()) + list(gpu_futures.keys()) - - for future in as_completed(all_futures): - task_id = cpu_futures.get(future) or gpu_futures.get(future) - pool_type = "CPU" if future in cpu_futures else "GPU" - - try: - data = future.result(timeout=300) # 5分钟超时 - result = TaskResult(task_id=task_id, success=True, data=data) - if on_result: - on_result(result) - except Exception as e: - logger.error(f"[{pool_type}] Task {task_id} failed: {e}") - result = TaskResult(task_id=task_id, success=False, data=None, error=str(e)) - if on_error: - on_error(task_id, e) - - results.append(result) - - return results -``` - ---- - -### 阶段 3:集成到 autolabel (1-2天) - -#### 3.1 修改 autolabel.py - -```python -# src/cli/autolabel.py - -def run_autolabel_dual_pool(args): - """使用双池模式运行自动标注""" - - from src.processing.dual_pool_coordinator import DualPoolCoordinator - - # 初始化数据库批处理 - db_batch = [] - db_batch_size = 100 - - def on_result(result: TaskResult): - """处理成功结果""" - nonlocal db_batch - db_batch.append(result.data) - - if len(db_batch) >= db_batch_size: - save_documents_batch(db_batch) - db_batch.clear() - - def on_error(task_id: str, error: Exception): - """处理错误""" - logger.error(f"Task {task_id} failed: {error}") - - # 创建双池协调器 - with DualPoolCoordinator( - cpu_workers=args.cpu_workers or 4, - gpu_workers=args.gpu_workers or 1, - gpu_id=0 - ) as coordinator: - - # 处理所有 CSV - for csv_file in csv_files: - documents = load_documents_from_csv(csv_file) - - results = coordinator.process_batch( - documents=documents, - cpu_task_fn=process_text_pdf, - gpu_task_fn=process_scanned_pdf, - on_result=on_result, - on_error=on_error - ) - - logger.info(f"CSV {csv_file}: {len(results)} processed") - - # 保存剩余批次 - if db_batch: - save_documents_batch(db_batch) -``` - ---- - -### 阶段 4:测试与验证 (1-2天) - -#### 4.1 单元测试 - -```python -# tests/unit/test_dual_pool.py - -import pytest -from src.processing.dual_pool_coordinator import DualPoolCoordinator, TaskResult - -class TestDualPoolCoordinator: - - 
def test_cpu_only_batch(self): - """测试纯 CPU 任务批处理""" - with DualPoolCoordinator(cpu_workers=2, gpu_workers=1) as coord: - docs = [{"id": f"doc_{i}", "type": "text"} for i in range(10)] - results = coord.process_batch(docs, cpu_fn, gpu_fn) - assert len(results) == 10 - assert all(r.success for r in results) - - def test_mixed_batch(self): - """测试混合任务批处理""" - with DualPoolCoordinator(cpu_workers=2, gpu_workers=1) as coord: - docs = [ - {"id": "text_1", "type": "text"}, - {"id": "scan_1", "type": "scanned"}, - {"id": "text_2", "type": "text"}, - ] - results = coord.process_batch(docs, cpu_fn, gpu_fn) - assert len(results) == 3 - - def test_timeout_handling(self): - """测试超时处理""" - pass - - def test_error_recovery(self): - """测试错误恢复""" - pass -``` - -#### 4.2 集成测试 - -```python -# tests/integration/test_autolabel_dual_pool.py - -def test_autolabel_with_dual_pool(): - """端到端测试双池模式""" - # 使用少量测试数据 - result = subprocess.run([ - "python", "-m", "src.cli.autolabel", - "--cpu-workers", "2", - "--gpu-workers", "1", - "--limit", "50" - ], capture_output=True) - - assert result.returncode == 0 - # 验证数据库记录 -``` - ---- - -## 4. 关键技术点 - -### 4.1 避免死锁的策略 - -```python -# 1. 使用 timeout -try: - result = future.result(timeout=300) -except TimeoutError: - logger.warning(f"Task timed out") - -# 2. 使用哨兵值 -SENTINEL = object() -queue.put(SENTINEL) # 发送结束信号 - -# 3. 检查进程状态 -if not worker.is_alive(): - logger.error("Worker died unexpectedly") - break - -# 4. 
先清空队列再 join -while not queue.empty(): - results.append(queue.get_nowait()) -worker.join(timeout=5.0) -``` - -### 4.2 PaddleOCR 特殊处理 - -```python -# PaddleOCR 必须在 worker 进程中初始化 -def init_paddle_worker(gpu_id: int): - global _ocr - import os - os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - - # 延迟导入,确保 CUDA 环境变量生效 - from paddleocr import PaddleOCR - _ocr = PaddleOCR( - use_angle_cls=True, - lang='en', - use_gpu=True, - show_log=False, - # 重要:设置 GPU 内存比例 - gpu_mem=2000 # 限制 GPU 内存使用 (MB) - ) -``` - -### 4.3 资源监控 - -```python -import psutil -import GPUtil - -def get_resource_usage(): - """获取系统资源使用情况""" - cpu_percent = psutil.cpu_percent(interval=1) - memory = psutil.virtual_memory() - - gpu_info = [] - for gpu in GPUtil.getGPUs(): - gpu_info.append({ - "id": gpu.id, - "memory_used": gpu.memoryUsed, - "memory_total": gpu.memoryTotal, - "utilization": gpu.load * 100 - }) - - return { - "cpu_percent": cpu_percent, - "memory_percent": memory.percent, - "gpu": gpu_info - } -``` - ---- - -## 5. 风险评估与应对 - -| 风险 | 可能性 | 影响 | 应对策略 | -|------|--------|------|----------| -| GPU 内存不足 | 中 | 高 | 限制 GPU worker = 1,设置 gpu_mem 参数 | -| 进程僵死 | 低 | 高 | 添加心跳检测,超时自动重启 | -| 任务分类错误 | 中 | 中 | 添加回退机制,CPU 失败后尝试 GPU | -| 数据库写入瓶颈 | 低 | 中 | 增大批处理大小,异步写入 | - ---- - -## 6. 备选方案 - -如果上述方案仍存在问题,可以考虑: - -### 6.1 使用 Ray - -```python -import ray - -ray.init() - -@ray.remote(num_cpus=1) -def cpu_task(data): - return process_text_pdf(data) - -@ray.remote(num_gpus=1) -def gpu_task(data): - return process_scanned_pdf(data) - -# 自动资源调度 -futures = [cpu_task.remote(d) for d in cpu_docs] -futures += [gpu_task.remote(d) for d in gpu_docs] -results = ray.get(futures) -``` - -### 6.2 单池 + 动态 GPU 调度 - -保持单池模式,但在每个任务内部动态决定是否使用 GPU: - -```python -def process_document(doc_data): - if is_scanned_pdf(doc_data): - # 使用 GPU (需要全局锁或信号量控制并发) - with gpu_semaphore: - return process_with_ocr(doc_data) - else: - return process_text_only(doc_data) -``` - ---- - -## 7. 
时间线总结 - -| 阶段 | 任务 | 预计工作量 | -|------|------|------------| -| 阶段 1 | 基础架构重构 | 2-3 天 | -| 阶段 2 | 双池协调器实现 | 2-3 天 | -| 阶段 3 | 集成到 autolabel | 1-2 天 | -| 阶段 4 | 测试与验证 | 1-2 天 | -| **总计** | | **6-10 天** | - ---- - -## 8. 参考资料 - -1. [Python concurrent.futures 官方文档](https://docs.python.org/3/library/concurrent.futures.html) -2. [PyTorch Multiprocessing Best Practices](https://docs.pytorch.org/docs/stable/notes/multiprocessing.html) -3. [Super Fast Python - ProcessPoolExecutor 完整指南](https://superfastpython.com/processpoolexecutor-in-python/) -4. [PaddleOCR 并行推理文档](http://www.paddleocr.ai/main/en/version3.x/pipeline_usage/instructions/parallel_inference.html) -5. [AWS - 跨 CPU/GPU 并行化 ML 推理](https://aws.amazon.com/blogs/machine-learning/parallelizing-across-multiple-cpu-gpus-to-speed-up-deep-learning-inference-at-the-edge/) -6. [Ray 分布式多进程处理](https://docs.ray.io/en/latest/ray-more-libs/multiprocessing.html) diff --git a/docs/product-plan-v2.md b/docs/product-plan-v2.md new file mode 100644 index 0000000..d5f8530 --- /dev/null +++ b/docs/product-plan-v2.md @@ -0,0 +1,1223 @@ +# Document Annotation Tool - Product Plan v2 + +## Table of Contents +1. [Product Requirements Document (PRD)](#1-product-requirements-document-prd) +2. [CSV Format Specification](#2-csv-format-specification) +3. [Database Schema Changes](#3-database-schema-changes) +4. [API Specification](#4-api-specification) +5. [UI Wireframes (Text-Based)](#5-ui-wireframes-text-based) +6. [Implementation Phases](#6-implementation-phases) +7. [State Machine Diagrams](#7-state-machine-diagrams) + +--- + +## 1. Product Requirements Document (PRD) + +### 1.1 Overview + +This enhancement adds batch upload capabilities, document lifecycle management, manual annotation workflow with auto-label dependency, comprehensive training management, and enhanced document detail views to the Invoice Master Document Annotation Tool. 
+ +### 1.2 User Stories + +#### Epic 1: Batch Upload (ZIP Support) + +| ID | User Story | Acceptance Criteria | Priority | +|----|------------|---------------------|----------| +| US-1.1 | As a user, I want to upload a ZIP file containing multiple PDFs so that I can process many documents at once | - ZIP file is extracted
- Each PDF is registered as a separate document
- Document IDs are returned for all files
- Invalid files are skipped with an error message | P0 | +| US-1.2 | As a user, I want to include a CSV file in my ZIP for auto-labeling so that annotations are created automatically | - CSV is parsed and validated
- DocumentId column maps to PDF filenames
- Field values are stored for auto-labeling
- Invalid CSV rows are logged | P0 | +| US-1.3 | As a user, I want to upload a single PDF with auto-label values via API so that I can integrate with my workflow | - PDF is uploaded
- Auto-label values provided in JSON body
- Auto-labeling runs automatically
- Document ID returned | P0 | +| US-1.4 | As a user, I want clear feedback on batch upload progress so that I know which files succeeded or failed | - Upload progress indicator
- Per-file status (success/failed)
- Error messages for failed files
- Summary count displayed | P1 | + +#### Epic 2: Document List and Status + +| ID | User Story | Acceptance Criteria | Priority | +|----|------------|---------------------|----------| +| US-2.1 | As a user, I want to see a list of all uploaded documents so that I can manage my annotations | - Paginated document list
- Shows filename, status, date
- Sortable columns
- Search/filter capability | P0 | +| US-2.2 | As a user, I want to see auto-label status for each document so that I know processing progress | - Status badge: pending, running, completed, failed
- Progress indicator for processing
- Error message for failed documents | P0 | +| US-2.3 | As a user, I want to see the upload source (API vs UI) so that I can track document origin | - Source column in list
- Filter by source
- Source shown in detail view | P1 | +| US-2.4 | As a user, I want to see annotation preview for completed documents so that I can quickly review | - Thumbnail with overlaid bounding boxes
- Annotation count badge
- Click to view full detail | P1 | + +#### Epic 3: Manual Annotation with Auto-Label Dependency + +| ID | User Story | Acceptance Criteria | Priority | +|----|------------|---------------------|----------| +| US-3.1 | As a user, I want to be blocked from manual annotation if auto-label is pending so that I don't lose work | - Clear message: "Auto-labeling in progress, please wait"
- Refresh button to check status
- Automatic unlock when complete | P0 | +| US-3.2 | As a user, I want to override auto-generated annotations so that I can correct errors | - Can edit any annotation
- Source changes from "auto" to "manual"
- Original auto value preserved in history
- Override timestamp recorded | P0 | +| US-3.3 | As a user, I want to see which annotations are manual vs auto so that I can review confidence | - Color-coded annotation badges
- Manual: solid border
- Auto: dashed border with confidence %
- Filter by source | P0 | +| US-3.4 | As a user, I want to accept or reject individual auto-annotations so that I can curate training data | - Accept button marks as verified
- Reject button removes annotation
- Bulk accept/reject actions | P1 | + +#### Epic 4: Training Page Features + +| ID | User Story | Acceptance Criteria | Priority | +|----|------------|---------------------|----------| +| US-4.1 | As a user, I want to see all documents available for training so that I can select training data | - Filtered list (only labeled documents)
- Shows annotation count per document
- Checkbox selection
- Select all/none options | P0 | +| US-4.2 | As a user, I want to select specific documents for training so that I can control data quality | - Multi-select with checkboxes
- Selection count displayed
- Clear selection button
- Persisted selection state | P0 | +| US-4.3 | As a user, I want to see all trained models so that I can track model history | - Model list with name, date, status
- Document count used
- mAP/accuracy metrics
- Download model link | P0 | +| US-4.4 | As a user, I want to see which documents were used in training so that I can track data lineage | - "Used in training" badge on documents
- Click to see model list
- Filter documents by training status | P1 | +| US-4.5 | As a user, I want to start a training job with selected documents so that I can create new models | - Start training button
- Training config options
- Progress monitoring
- Email notification on completion | P0 | + +#### Epic 5: Document Detail View (Enhanced) + +| ID | User Story | Acceptance Criteria | Priority | +|----|------------|---------------------|----------| +| US-5.1 | As a user, I want to see all annotations with their source so that I can review data quality | - Annotation list with source column
- Confidence score for auto annotations
- Edit/delete buttons
- Group by field type | P0 | +| US-5.2 | As a user, I want to see training history for a document so that I can understand model lineage | - List of models using this document
- Training date and model name
- Link to model detail page | P1 | +| US-5.3 | As a user, I want to edit annotations inline so that I can quickly make corrections | - Click to edit bounding box
- Drag to resize
- Double-click to edit text value
- Save/cancel buttons | P0 | +| US-5.4 | As a user, I want to see auto vs manual annotation comparison so that I can evaluate auto-label quality | - Side-by-side comparison view
- Highlight differences
- Override history timeline | P2 | + +#### Epic 6: API Endpoints + +| ID | User Story | Acceptance Criteria | Priority | +|----|------------|---------------------|----------| +| US-6.1 | As a developer, I want to upload ZIP/PDF via API so that I can automate document ingestion | - POST endpoint accepts multipart
- Returns document IDs array
- Async processing option
- Webhook callback support | P0 | +| US-6.2 | As a developer, I want to upload PDF with auto-label values so that I can pre-annotate documents | - JSON body with field values
- Auto-label runs synchronously or asynchronously
- Returns annotation IDs | P0 | +| US-6.3 | As a developer, I want to query document status so that I can poll for completion | - GET endpoint with document ID
- Returns full status object
- Includes annotation summary | P0 | +| US-6.4 | As a developer, I want API-uploaded documents visible in UI so that I can manage all documents centrally | - Same data model for API/UI uploads
- Source field distinguishes origin
- Full UI functionality available | P0 | + +--- + +## 2. CSV Format Specification + +### 2.1 Required Headers + +```csv +customer_number,supplier_name,supplier_organisation_number,supplier_accounts,DocumentId,InvoiceNumber,InvoiceDate,InvoiceDueDate,Amount,OCR,Message,Bankgiro,Plusgiro +``` + +### 2.2 Column Definitions + +| Column | Type | Required | Maps to Class | Description | Validation Rules | +|--------|------|----------|---------------|-------------|------------------| +| `DocumentId` | string | YES | N/A | PDF filename (without .pdf extension) | Non-empty, alphanumeric + underscore/hyphen | +| `customer_number` | string | NO | customer_number (9) | Customer reference number | Max 50 chars | +| `supplier_name` | string | NO | N/A (metadata only) | Supplier company name | Max 255 chars | +| `supplier_organisation_number` | string | NO | supplier_organisation_number (7) | Swedish org number (XXXXXX-XXXX) | Format: 6 digits, hyphen, 4 digits | +| `supplier_accounts` | string | NO | N/A (metadata) | Pipe-separated account numbers | Max 500 chars | +| `InvoiceNumber` | string | NO | invoice_number (0) | Invoice reference | Max 50 chars | +| `InvoiceDate` | date | NO | invoice_date (1) | Invoice issue date | ISO 8601 or YYYY-MM-DD | +| `InvoiceDueDate` | date | NO | invoice_due_date (2) | Payment due date | ISO 8601 or YYYY-MM-DD | +| `Amount` | decimal | NO | amount (6) | Invoice total amount | Numeric, max 2 decimal places | +| `OCR` | string | NO | ocr_number (3) | Swedish OCR payment reference | Numeric string, max 25 chars | +| `Message` | string | NO | N/A (metadata only) | Free-text payment message | Max 140 chars | +| `Bankgiro` | string | NO | bankgiro (4) | Bankgiro account number | Format: XXX-XXXX or 7-8 digits | +| `Plusgiro` | string | NO | plusgiro (5) | Plusgiro account number | Format: XXXXXX-X or 6-8 digits | + +### 2.3 Field to Class Mapping + +```python +CSV_TO_CLASS_MAPPING = { + 'InvoiceNumber': 0, # invoice_number + 'InvoiceDate': 1, # 
invoice_date + 'InvoiceDueDate': 2, # invoice_due_date + 'OCR': 3, # ocr_number + 'Bankgiro': 4, # bankgiro + 'Plusgiro': 5, # plusgiro + 'Amount': 6, # amount + 'supplier_organisation_number': 7, # supplier_organisation_number + # 8: payment_line (derived from OCR/Bankgiro/Amount) + 'customer_number': 9, # customer_number +} +``` + +### 2.4 Example CSV + +```csv +customer_number,supplier_name,supplier_organisation_number,supplier_accounts,DocumentId,InvoiceNumber,InvoiceDate,InvoiceDueDate,Amount,OCR,Message,Bankgiro,Plusgiro +C12345,ACME Corp,556677-8899,123-4567|987-6543,INV001,F2024-001,2024-01-15,2024-02-15,1250.00,7350012345678,,123-4567, +C12346,Widget AB,112233-4455,,INV002,F2024-002,2024-01-16,2024-02-16,3450.50,,,987-6543, +``` + +### 2.5 Validation Rules + +1. **DocumentId**: Required, must match a PDF filename in the ZIP +2. **At least one matchable field**: One of InvoiceNumber, OCR, Bankgiro, Plusgiro, Amount, supplier_organisation_number must be non-empty +3. **Date formats**: YYYY-MM-DD, DD/MM/YYYY, DD.MM.YYYY +4. **Amount formats**: 1234.56, 1 234,56, 1234,56 SEK +5. **Swedish org number**: XXXXXX-XXXX pattern + +--- + +## 3. 
Database Schema Changes + +### 3.1 New Tables + +#### 3.1.1 BatchUpload Table + +```sql +CREATE TABLE batch_uploads ( + batch_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + admin_token VARCHAR(255) NOT NULL REFERENCES admin_tokens(token), + filename VARCHAR(255) NOT NULL, + file_size INTEGER NOT NULL, + upload_source VARCHAR(20) NOT NULL DEFAULT 'ui', -- 'ui' or 'api' + status VARCHAR(20) NOT NULL DEFAULT 'processing', + -- Status: processing, completed, partial, failed + total_files INTEGER DEFAULT 0, + processed_files INTEGER DEFAULT 0, + successful_files INTEGER DEFAULT 0, + failed_files INTEGER DEFAULT 0, + error_message TEXT, + created_at TIMESTAMP NOT NULL DEFAULT NOW(), + completed_at TIMESTAMP +); + +CREATE INDEX idx_batch_uploads_admin_token ON batch_uploads(admin_token); +CREATE INDEX idx_batch_uploads_status ON batch_uploads(status); +``` + +#### 3.1.2 BatchUploadFile Table + +```sql +CREATE TABLE batch_upload_files ( + file_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + batch_id UUID NOT NULL REFERENCES batch_uploads(batch_id) ON DELETE CASCADE, + document_id UUID REFERENCES admin_documents(document_id), + filename VARCHAR(255) NOT NULL, + status VARCHAR(20) NOT NULL DEFAULT 'pending', + -- Status: pending, processing, completed, failed, skipped + error_message TEXT, + csv_row_data JSONB, -- Parsed CSV row for this file + created_at TIMESTAMP NOT NULL DEFAULT NOW(), + processed_at TIMESTAMP +); + +CREATE INDEX idx_batch_upload_files_batch_id ON batch_upload_files(batch_id); +CREATE INDEX idx_batch_upload_files_document_id ON batch_upload_files(document_id); +``` + +#### 3.1.3 TrainingDocumentLink Table (Junction Table) + +```sql +CREATE TABLE training_document_links ( + link_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + task_id UUID NOT NULL REFERENCES training_tasks(task_id) ON DELETE CASCADE, + document_id UUID NOT NULL REFERENCES admin_documents(document_id) ON DELETE CASCADE, + annotation_snapshot JSONB, -- Snapshot of annotations at training 
time + created_at TIMESTAMP NOT NULL DEFAULT NOW(), + + UNIQUE(task_id, document_id) +); + +CREATE INDEX idx_training_doc_links_task_id ON training_document_links(task_id); +CREATE INDEX idx_training_doc_links_document_id ON training_document_links(document_id); +``` + +#### 3.1.4 AnnotationHistory Table + +```sql +CREATE TABLE annotation_history ( + history_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + annotation_id UUID NOT NULL REFERENCES admin_annotations(annotation_id) ON DELETE CASCADE, + action VARCHAR(20) NOT NULL, -- 'created', 'updated', 'deleted', 'override' + previous_value JSONB, -- Full annotation state before change + new_value JSONB, -- Full annotation state after change + changed_by VARCHAR(255), -- admin_token + change_reason TEXT, + created_at TIMESTAMP NOT NULL DEFAULT NOW() +); + +CREATE INDEX idx_annotation_history_annotation_id ON annotation_history(annotation_id); +CREATE INDEX idx_annotation_history_created_at ON annotation_history(created_at); +``` + +### 3.2 Modified Tables + +#### 3.2.1 AdminDocument Modifications + +```sql +ALTER TABLE admin_documents ADD COLUMN upload_source VARCHAR(20) DEFAULT 'ui'; +-- Values: 'ui', 'api' + +ALTER TABLE admin_documents ADD COLUMN batch_id UUID REFERENCES batch_uploads(batch_id); + +ALTER TABLE admin_documents ADD COLUMN csv_field_values JSONB; +-- Stores original CSV values for reference + +ALTER TABLE admin_documents ADD COLUMN auto_label_queued_at TIMESTAMP; +-- When auto-label was queued (for dependency checking) + +ALTER TABLE admin_documents ADD COLUMN annotation_lock_until TIMESTAMP; +-- Lock for manual annotation while auto-label runs + +CREATE INDEX idx_admin_documents_upload_source ON admin_documents(upload_source); +CREATE INDEX idx_admin_documents_batch_id ON admin_documents(batch_id); +``` + +#### 3.2.2 AdminAnnotation Modifications + +```sql +ALTER TABLE admin_annotations ADD COLUMN is_verified BOOLEAN DEFAULT FALSE; +-- User-verified annotation + +ALTER TABLE admin_annotations ADD 
COLUMN verified_at TIMESTAMP; +ALTER TABLE admin_annotations ADD COLUMN verified_by VARCHAR(255); + +ALTER TABLE admin_annotations ADD COLUMN override_source VARCHAR(20); +-- If this annotation overrides another: 'auto' or 'imported' + +ALTER TABLE admin_annotations ADD COLUMN original_annotation_id UUID; +-- Reference to the annotation this overrides + +CREATE INDEX idx_admin_annotations_source ON admin_annotations(source); +CREATE INDEX idx_admin_annotations_is_verified ON admin_annotations(is_verified); +``` + +#### 3.2.3 TrainingTask Modifications + +```sql +ALTER TABLE training_tasks ADD COLUMN document_count INTEGER DEFAULT 0; +-- Count of documents used in training + +ALTER TABLE training_tasks ADD COLUMN document_ids UUID[]; +-- Array of document IDs used (for quick reference) + +ALTER TABLE training_tasks ADD COLUMN metrics_mAP FLOAT; +ALTER TABLE training_tasks ADD COLUMN metrics_precision FLOAT; +ALTER TABLE training_tasks ADD COLUMN metrics_recall FLOAT; +-- Extracted metrics for easy querying + +CREATE INDEX idx_training_tasks_metrics ON training_tasks(metrics_mAP); +``` + +### 3.3 SQLModel Definitions + +```python +# File: src/data/admin_models.py + +from datetime import datetime +from typing import Any +from uuid import UUID, uuid4 +from sqlmodel import Field, SQLModel, Column, JSON, ARRAY +from sqlalchemy import String + + +class BatchUpload(SQLModel, table=True): + """Batch upload record for ZIP uploads.""" + + __tablename__ = "batch_uploads" + + batch_id: UUID = Field(default_factory=uuid4, primary_key=True) + admin_token: str = Field(foreign_key="admin_tokens.token", max_length=255, index=True) + filename: str = Field(max_length=255) + file_size: int + upload_source: str = Field(default="ui", max_length=20) + status: str = Field(default="processing", max_length=20, index=True) + total_files: int = Field(default=0) + processed_files: int = Field(default=0) + successful_files: int = Field(default=0) + failed_files: int = Field(default=0) + 
error_message: str | None = Field(default=None) + created_at: datetime = Field(default_factory=datetime.utcnow) + completed_at: datetime | None = Field(default=None) + + +class BatchUploadFile(SQLModel, table=True): + """Individual file within a batch upload.""" + + __tablename__ = "batch_upload_files" + + file_id: UUID = Field(default_factory=uuid4, primary_key=True) + batch_id: UUID = Field(foreign_key="batch_uploads.batch_id", index=True) + document_id: UUID | None = Field(default=None, foreign_key="admin_documents.document_id") + filename: str = Field(max_length=255) + status: str = Field(default="pending", max_length=20) + error_message: str | None = Field(default=None) + csv_row_data: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + created_at: datetime = Field(default_factory=datetime.utcnow) + processed_at: datetime | None = Field(default=None) + + +class TrainingDocumentLink(SQLModel, table=True): + """Link between training tasks and documents used.""" + + __tablename__ = "training_document_links" + + link_id: UUID = Field(default_factory=uuid4, primary_key=True) + task_id: UUID = Field(foreign_key="training_tasks.task_id", index=True) + document_id: UUID = Field(foreign_key="admin_documents.document_id", index=True) + annotation_snapshot: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + created_at: datetime = Field(default_factory=datetime.utcnow) + + +class AnnotationHistory(SQLModel, table=True): + """History of annotation changes.""" + + __tablename__ = "annotation_history" + + history_id: UUID = Field(default_factory=uuid4, primary_key=True) + annotation_id: UUID = Field(foreign_key="admin_annotations.annotation_id", index=True) + action: str = Field(max_length=20) + previous_value: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + new_value: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + changed_by: str | None = Field(default=None, max_length=255) + change_reason: 
str | None = Field(default=None) + created_at: datetime = Field(default_factory=datetime.utcnow, index=True) +``` + +--- + +## 4. API Specification + +### 4.1 New Endpoints + +#### 4.1.1 Batch Upload (ZIP) + +```yaml +POST /api/v1/admin/batch/upload +Content-Type: multipart/form-data + +Request: + file: binary (ZIP file) + async: boolean (default: true) + auto_label: boolean (default: true) + +Response (202 Accepted): +{ + "batch_id": "uuid", + "status": "processing", + "total_files": 25, + "message": "Batch upload started. Use batch_id to check progress.", + "status_url": "/api/v1/admin/batch/{batch_id}" +} + +Response (200 OK - sync mode): +{ + "batch_id": "uuid", + "status": "completed", + "total_files": 25, + "successful_files": 23, + "failed_files": 2, + "documents": [ + { + "document_id": "uuid", + "filename": "INV001.pdf", + "status": "completed", + "auto_label_status": "completed", + "annotations_created": 8 + } + ], + "errors": [ + { + "filename": "invalid.pdf", + "error": "Corrupted PDF file" + } + ] +} +``` + +#### 4.1.2 Batch Status + +```yaml +GET /api/v1/admin/batch/{batch_id} + +Response: +{ + "batch_id": "uuid", + "status": "processing", + "progress": { + "total": 25, + "processed": 15, + "successful": 14, + "failed": 1, + "percentage": 60 + }, + "files": [ + { + "file_id": "uuid", + "filename": "INV001.pdf", + "document_id": "uuid", + "status": "completed" + } + ], + "created_at": "2024-01-15T10:00:00Z", + "estimated_completion": "2024-01-15T10:05:00Z" +} +``` + +#### 4.1.3 Upload PDF with Auto-Label Values + +```yaml +POST /api/v1/admin/documents/upload-with-labels +Content-Type: multipart/form-data + +Request: + file: binary (PDF file) + field_values: JSON string + { + "InvoiceNumber": "F2024-001", + "InvoiceDate": "2024-01-15", + "Amount": "1250.00", + "OCR": "7350012345678", + "Bankgiro": "123-4567" + } + auto_label: boolean (default: true) + wait_for_completion: boolean (default: false) + +Response (202 Accepted): +{ + "document_id": "uuid", + 
"filename": "invoice.pdf", + "status": "auto_labeling", + "auto_label_status": "running", + "message": "Document uploaded. Auto-labeling in progress." +} + +Response (200 OK - wait_for_completion=true): +{ + "document_id": "uuid", + "filename": "invoice.pdf", + "status": "labeled", + "auto_label_status": "completed", + "annotations": [ + { + "annotation_id": "uuid", + "class_id": 0, + "class_name": "invoice_number", + "text_value": "F2024-001", + "confidence": 0.95, + "bbox": { "x": 100, "y": 200, "width": 150, "height": 30 } + } + ] +} +``` + +#### 4.1.4 Query Document Status + +```yaml +GET /api/v1/admin/documents/{document_id}/status + +Response: +{ + "document_id": "uuid", + "filename": "invoice.pdf", + "status": "labeled", + "auto_label_status": "completed", + "upload_source": "api", + "annotation_summary": { + "total": 8, + "manual": 2, + "auto": 6, + "verified": 3 + }, + "can_annotate": true, + "annotation_lock_reason": null, + "training_history": [ + { + "task_id": "uuid", + "task_name": "Training Run 2024-01", + "trained_at": "2024-01-20T15:00:00Z" + } + ] +} +``` + +#### 4.1.5 Training with Document Selection + +```yaml +POST /api/v1/admin/training/tasks +Content-Type: application/json + +Request: +{ + "name": "Training Run 2024-01", + "description": "First training run with 500 documents", + "document_ids": ["uuid1", "uuid2", "uuid3"], + "config": { + "model_name": "yolo11n.pt", + "epochs": 100, + "batch_size": 16, + "image_size": 640 + }, + "scheduled_at": "2024-01-20T22:00:00Z" +} + +Response: +{ + "task_id": "uuid", + "name": "Training Run 2024-01", + "status": "scheduled", + "document_count": 500, + "message": "Training task scheduled for 2024-01-20T22:00:00Z" +} +``` + +#### 4.1.6 Get Documents for Training + +```yaml +GET /api/v1/admin/training/documents + +Query Parameters: + - status: labeled (required) + - has_annotations: true + - min_annotation_count: 3 + - exclude_used_in_training: boolean + - limit: 100 + - offset: 0 + +Response: +{ + 
"total": 1500, + "documents": [ + { + "document_id": "uuid", + "filename": "INV001.pdf", + "annotation_count": 8, + "annotation_sources": { "manual": 3, "auto": 5 }, + "used_in_training": ["task_id_1", "task_id_2"], + "last_modified": "2024-01-15T10:00:00Z" + } + ] +} +``` + +#### 4.1.7 Get Model List + +```yaml +GET /api/v1/admin/training/models + +Query Parameters: + - status: completed + - limit: 20 + - offset: 0 + +Response: +{ + "total": 15, + "models": [ + { + "task_id": "uuid", + "name": "Training Run 2024-01", + "status": "completed", + "document_count": 500, + "created_at": "2024-01-20T15:00:00Z", + "completed_at": "2024-01-20T18:30:00Z", + "metrics": { + "mAP": 0.935, + "precision": 0.92, + "recall": 0.88 + }, + "model_path": "runs/train/invoice_fields_20240120/weights/best.pt", + "download_url": "/api/v1/admin/training/models/{task_id}/download" + } + ] +} +``` + +#### 4.1.8 Override Annotation + +```yaml +PATCH /api/v1/admin/documents/{document_id}/annotations/{annotation_id}/override +Content-Type: application/json + +Request: +{ + "bbox": { "x": 110, "y": 205, "width": 145, "height": 28 }, + "text_value": "F2024-001-A", + "reason": "Corrected OCR error" +} + +Response: +{ + "annotation_id": "uuid", + "source": "manual", + "override_source": "auto", + "original_annotation_id": "uuid", + "message": "Annotation overridden successfully", + "history_id": "uuid" +} +``` + +### 4.2 Modified Endpoints + +#### 4.2.1 Document List (Enhanced) + +```yaml +GET /api/v1/admin/documents + +Query Parameters (additions): + - upload_source: 'ui' | 'api' | null + - has_annotations: boolean + - auto_label_status: 'pending' | 'running' | 'completed' | 'failed' + - used_in_training: boolean + - batch_id: uuid + +Response (additions to DocumentItem): +{ + "documents": [ + { + // ... existing fields ... 
+ "upload_source": "api", + "batch_id": "uuid", + "can_annotate": true, + "training_count": 2 + } + ] +} +``` + +#### 4.2.2 Document Detail (Enhanced) + +```yaml +GET /api/v1/admin/documents/{document_id} + +Response (additions): +{ + // ... existing fields ... + "upload_source": "api", + "csv_field_values": { + "InvoiceNumber": "F2024-001", + "Amount": "1250.00" + }, + "can_annotate": true, + "annotation_lock_reason": null, + "annotations": [ + { + // ... existing fields ... + "is_verified": true, + "verified_at": "2024-01-16T09:00:00Z", + "override_source": null + } + ], + "training_history": [ + { + "task_id": "uuid", + "name": "Training Run 2024-01", + "trained_at": "2024-01-20T15:00:00Z", + "model_metrics": { "mAP": 0.935 } + } + ] +} +``` + +--- + +## 5. UI Wireframes (Text-Based) + +### 5.1 Document List View + +``` ++------------------------------------------------------------------+ +| DOCUMENT ANNOTATION TOOL [User: Admin] [Logout]| ++------------------------------------------------------------------+ +| [Documents] [Training] [Models] [Settings] | ++------------------------------------------------------------------+ +| | +| DOCUMENTS | +| +-----------------+ +-----------------------------------------+ | +| | UPLOAD | | FILTERS | | +| | [Single PDF] | | Status: [All v] Source: [All v] | | +| | [ZIP Batch] | | Auto-Label: [All v] Search: [________] | | +| +-----------------+ +-----------------------------------------+ | +| | +| +--------------------------------------------------------------+ | +| | [] Filename | Status | Auto-Label | Source | Date | | +| +--------------------------------------------------------------+ | +| | [] INV001.pdf | Labeled | Completed | API | 01/15 | | +| | [8 annotations] | [Preview] | [95%] | | | | +| +--------------------------------------------------------------+ | +| | [] INV002.pdf | Pending | Running | UI | 01/16 | | +| | [0 annotations] | [Locked] | [==== ] | | | | +| 
+--------------------------------------------------------------+ | +| | [] INV003.pdf | Labeled | Failed | API | 01/16 | | +| | [5 annotations] | [Preview] | [Retry] | | | | +| +--------------------------------------------------------------+ | +| | [] INV004.pdf | Labeled | Completed | UI | 01/17 | | +| | [10 annotations]| [Preview] | [98%] | [Used] | | | +| +--------------------------------------------------------------+ | +| | +| Showing 1-20 of 1,543 documents [<] [1] [2] [3] ... [78] [>] | +| | +| [Delete Selected] [Start Training with Selected] | ++------------------------------------------------------------------+ +``` + +### 5.2 Document Detail View + +``` ++------------------------------------------------------------------+ +| < Back to Documents INV001.pdf | ++------------------------------------------------------------------+ +| | +| +---------------------------+ +-------------------------------+ | +| | | | DOCUMENT INFO | | +| | | | Status: Labeled | | +| | [Page 1 Image with | | Source: API Upload | | +| | Annotation Overlays] | | Auto-Label: Completed (95%) | | +| | | | Pages: 1 | | +| | [Manual: Solid border] | | Uploaded: 2024-01-15 | | +| | [Auto: Dashed border] | | | | +| | | | TRAINING HISTORY | | +| | | | - Run 2024-01 (mAP: 93.5%) | | +| | | | - Run 2024-02 (mAP: 95.1%) | | +| | | | | | +| +---------------------------+ +-------------------------------+ | +| | +| ANNOTATIONS [Add Annotation] [Run OCR] | +| +--------------------------------------------------------------+ | +| | Field | Value | Source | Conf | Actions | | +| +--------------------------------------------------------------+ | +| | invoice_number | F2024-001 | Manual | - | [E] [D] | | +| +--------------------------------------------------------------+ | +| | invoice_date | 2024-01-15 | Auto | 95% | [V] [E][D]| | +| +--------------------------------------------------------------+ | +| | amount | 1,250.00 | Auto | 98% | [V] [E][D]| | +| 
+--------------------------------------------------------------+ | +| | ocr_number | 7350012345 | Auto | 87% | [V] [E][D]| | +| +--------------------------------------------------------------+ | +| | bankgiro | 123-4567 | Manual | - | [E] [D] | | +| +--------------------------------------------------------------+ | +| | +| [V] = Verify [E] = Edit [D] = Delete | +| | +| CSV FIELD VALUES (Reference) | +| +--------------------------------------------------------------+ | +| | InvoiceNumber: F2024-001 | InvoiceDate: 2024-01-15 | | +| | Amount: 1250.00 | OCR: 7350012345678 | | +| | Bankgiro: 123-4567 | | | +| +--------------------------------------------------------------+ | ++------------------------------------------------------------------+ +``` + +### 5.3 Training Page + +``` ++------------------------------------------------------------------+ +| DOCUMENT ANNOTATION TOOL [User: Admin] [Logout]| ++------------------------------------------------------------------+ +| [Documents] [Training] [Models] [Settings] | ++------------------------------------------------------------------+ +| | +| TRAINING | +| | +| DOCUMENT SELECTION Selected: 500 docs | +| +--------------------------------------------------------------+ | +| | [] Filename | Annotations | Source | Last Modified | | +| +--------------------------------------------------------------+ | +| | [x] INV001.pdf | 8 (M:3 A:5) | API | 2024-01-15 | | +| +--------------------------------------------------------------+ | +| | [x] INV002.pdf | 10 (M:2 A:8)| UI | 2024-01-16 | | +| +--------------------------------------------------------------+ | +| | [ ] INV003.pdf | 5 (M:5 A:0) | UI | 2024-01-16 | | +| +--------------------------------------------------------------+ | +| | [x] INV004.pdf | 12 (M:4 A:8)| API | 2024-01-17 | | +| +--------------------------------------------------------------+ | +| | +| [Select All] [Select None] [Select Not Used in Training] | +| | +| Showing labeled documents only [<] [1] [2] [3] ... 
[50] [>] | +| | +| TRAINING CONFIGURATION | +| +--------------------------------------------------------------+ | +| | Name: [Training Run 2024-01____________] | | +| | Description: [First training with 500 documents_________] | | +| | | | +| | Base Model: [yolo11n.pt v] Epochs: [100] Batch: [16] | | +| | Image Size: [640] Device: [GPU 0 v] | | +| | | | +| | [ ] Schedule for later: [2024-01-20] [22:00] | | +| +--------------------------------------------------------------+ | +| | +| [Start Training] | ++------------------------------------------------------------------+ +``` + +### 5.4 Model History View + +``` ++------------------------------------------------------------------+ +| DOCUMENT ANNOTATION TOOL [User: Admin] [Logout]| ++------------------------------------------------------------------+ +| [Documents] [Training] [Models] [Settings] | ++------------------------------------------------------------------+ +| | +| TRAINED MODELS | +| | +| +--------------------------------------------------------------+ | +| | Name | Status | Docs | mAP | Date | | +| +--------------------------------------------------------------+ | +| | Training Run 2024-03 | Running | 750 | - | 01/25 | | +| | | [==== ] | | | | | +| | | [View Logs] [Cancel] | | +| +--------------------------------------------------------------+ | +| | Training Run 2024-02 | Completed | 600 | 95.1% | 01/20 | | +| | | P: 94% R: 92% | | +| | | [View] [Download] [Use as Base] | | +| +--------------------------------------------------------------+ | +| | Training Run 2024-01 | Completed | 500 | 93.5% | 01/15 | | +| | | P: 92% R: 88% | | +| | | [View] [Download] [Use as Base] | | +| +--------------------------------------------------------------+ | +| | Initial Training | Completed | 200 | 85.2% | 01/10 | | +| | | P: 84% R: 80% | | +| | | [View] [Download] [Use as Base] | | +| +--------------------------------------------------------------+ | +| | +| MODEL DETAIL: Training Run 2024-02 | +| 
+--------------------------------------------------------------+ | +| | Created: 2024-01-20 15:00 | Completed: 2024-01-20 18:30 | | +| | Duration: 3h 30m | Documents: 600 | | +| | | | +| | Metrics: | | +| | - mAP@0.5: 95.1% | | +| | - Precision: 94% | | +| | - Recall: 92% | | +| | | | +| | Configuration: | | +| | - Base: yolo11n.pt Epochs: 100 Batch: 16 Size: 640 | | +| | | | +| | Documents Used: [View 600 documents] | | +| +--------------------------------------------------------------+ | ++------------------------------------------------------------------+ +``` + +### 5.5 Batch Upload Modal + +``` ++------------------------------------------------------------------+ +| BATCH UPLOAD [X] | ++------------------------------------------------------------------+ +| | +| Upload a ZIP file containing: | +| - Multiple PDF files | +| - (Optional) CSV file for auto-labeling | +| | +| +--------------------------------------------------------------+ | +| | | | +| | [Drag and drop ZIP file here] | | +| | or | | +| | [Browse Files] | | +| | | | +| +--------------------------------------------------------------+ | +| | +| [x] Auto-label documents (requires CSV) | +| [ ] Process asynchronously | +| | +| CSV FORMAT REQUIREMENTS: | +| Required columns: DocumentId | +| Optional: InvoiceNumber, InvoiceDate, Amount, OCR, Bankgiro... | +| [View full CSV specification] | +| | +| [Cancel] [Upload] | ++------------------------------------------------------------------+ + ++------------------------------------------------------------------+ +| UPLOAD PROGRESS [X] | ++------------------------------------------------------------------+ +| | +| Processing batch upload... | +| | +| [======================================== ] 80% | +| | +| Files: 20 / 25 | +| Successful: 18 | +| Failed: 2 | +| | +| +--------------------------------------------------------------+ | +| | [OK] INV001.pdf - Completed (8 annotations) | | +| | [OK] INV002.pdf - Completed (10 annotations) | | +| | [!!] 
INV003.pdf - Failed: Corrupted PDF | | +| | [OK] INV004.pdf - Completed (6 annotations) | | +| | [...] Processing INV005.pdf... | | +| +--------------------------------------------------------------+ | +| | +| [Cancel] [Close] | ++------------------------------------------------------------------+ +``` + +--- + +## 6. Implementation Phases + +### Phase 1: Database and Core Models (Week 1) + +| Step | Task | Files | Risk | +|------|------|-------|------| +| 1.1 | Create database migration script | `src/data/migrations/` | Low | +| 1.2 | Add new SQLModel classes | `src/data/admin_models.py` | Low | +| 1.3 | Update AdminDB with new methods | `src/data/admin_db.py` | Medium | +| 1.4 | Add unit tests for new models | `tests/data/test_admin_models.py` | Low | + +**Dependencies**: None +**Risk Assessment**: Low - mostly additive changes to existing structure + +### Phase 2: Batch Upload Backend (Week 2) + +| Step | Task | Files | Risk | +|------|------|-------|------| +| 2.1 | Create ZIP extraction service | `src/web/batch_upload_service.py` | Medium | +| 2.2 | Add CSV parsing with new format | `src/data/csv_loader.py` | Low | +| 2.3 | Create batch upload routes | `src/web/admin_batch_routes.py` | Medium | +| 2.4 | Add async processing queue | `src/web/batch_queue.py` | High | +| 2.5 | Integration tests | `tests/web/test_batch_upload.py` | Medium | + +**Dependencies**: Phase 1 +**Risk Assessment**: Medium - ZIP handling and async processing add complexity + +### Phase 3: Enhanced Document Management (Week 3) + +| Step | Task | Files | Risk | +|------|------|-------|------| +| 3.1 | Add upload source tracking | `src/data/admin_models.py` | Low | +| 3.2 | Update document list endpoint | `src/web/admin_routes.py` | Low | +| 3.3 | Add annotation lock mechanism | `src/web/admin_annotation_routes.py` | Medium | +| 3.4 | Add document status endpoint | `src/web/admin_routes.py` | Low | +| 3.5 | Update auto-label service | `src/web/admin_autolabel.py` | Medium | + 
+**Dependencies**: Phase 1, Phase 2 +**Risk Assessment**: Medium - locking mechanism needs careful implementation + +### Phase 4: Manual Annotation Enhancement (Week 4) + +| Step | Task | Files | Risk | +|------|------|-------|------| +| 4.1 | Add override mechanism | `src/web/admin_annotation_routes.py` | Medium | +| 4.2 | Add annotation history | `src/data/admin_db.py` | Low | +| 4.3 | Add verification endpoint | `src/web/admin_annotation_routes.py` | Low | +| 4.4 | Update schemas with new fields | `src/web/admin_schemas.py` | Low | + +**Dependencies**: Phase 3 +**Risk Assessment**: Low - extending existing annotation system + +### Phase 5: Training Integration (Week 5) + +| Step | Task | Files | Risk | +|------|------|-------|------| +| 5.1 | Add document selection for training | `src/web/admin_training_routes.py` | Medium | +| 5.2 | Add training document link table | `src/data/admin_db.py` | Low | +| 5.3 | Add model list endpoint | `src/web/admin_training_routes.py` | Low | +| 5.4 | Update export with selection | `src/web/admin_training_routes.py` | Medium | +| 5.5 | Add metrics extraction | `src/cli/train.py` | Medium | + +**Dependencies**: Phase 1, Phase 4 +**Risk Assessment**: Medium - integration with training pipeline + +### Phase 6: Frontend Implementation (Weeks 6-7) + +| Step | Task | Files | Risk | +|------|------|-------|------| +| 6.1 | Create React component structure | `frontend/` | High | +| 6.2 | Implement document list view | `frontend/src/components/` | Medium | +| 6.3 | Implement document detail view | `frontend/src/components/` | High | +| 6.4 | Implement training page | `frontend/src/components/` | Medium | +| 6.5 | Implement batch upload modal | `frontend/src/components/` | Medium | +| 6.6 | Add annotation editor | `frontend/src/components/` | High | + +**Dependencies**: Phase 2-5 +**Risk Assessment**: High - frontend development is a new component + +### Phase 7: Testing and Documentation (Week 8) + +| Step | Task | Files | Risk | 
+|------|------|-------|------| +| 7.1 | Integration tests | `tests/integration/` | Medium | +| 7.2 | E2E tests | `tests/e2e/` | High | +| 7.3 | API documentation | `docs/api/` | Low | +| 7.4 | User guide | `docs/user-guide/` | Low | +| 7.5 | Performance testing | `tests/performance/` | Medium | + +**Dependencies**: All phases +**Risk Assessment**: Medium + +### Risk Mitigation Strategies + +| Risk | Impact | Probability | Mitigation | +|------|--------|-------------|------------| +| ZIP bomb attack | High | Low | Limit max file count, max total size, scan before extraction | +| Async queue failures | Medium | Medium | Implement retry logic, dead letter queue, manual retry endpoint | +| Annotation lock deadlock | Medium | Low | Timeout-based locks, admin override capability | +| Large batch performance | Medium | High | Chunked processing, progress tracking, background workers | +| Database migration issues | High | Low | Backward compatible changes, rollback scripts | +| Frontend complexity | Medium | Medium | Use established UI framework, incremental delivery | + +--- + +## 7. 
State Machine Diagrams + +### 7.1 Document Lifecycle States + +``` + +-------------+ + | DELETED | + +------^------+ + | + | delete + | ++----------+ upload +----------+ | +| | --------------> | |--+ +| (none) | | PENDING | +| | | | ++----------+ +----+-----+ + | + +----------------+-----------------+ + | | + | trigger auto-label | create manual annotation + v | + +-------------+ | + | | | + | AUTO_LABEL- | | + | ING | | + | | | + +------+------+ | + | | + +---------+---------+ | + | | | + | complete | fail | + v v | ++-------------+ +-------------+ | +| | | | | +| LABELED |<----+ PENDING +<--------------+ +| | retry| (failed) | ++------+------+ +-------------+ + | + | export + v ++-------------+ +| | +| EXPORTED | +| | ++-------------+ +``` + +### 7.2 Auto-Label Workflow States + +``` + +-------------+ + | MANUAL | + | OVERRIDE | + +------^------+ + | + | user edit + | ++----------+ queue +----------+ | +-----------+ +| | --------------> | | | | | +| (none) | | QUEUED |--+--->| COMPLETED | +| | | | | | ++----------+ +----+-----+ +-----^-----+ + | | + | start | + v | + +-------------+ | + | | | + | RUNNING +-----------+ + | | success + +------+------+ + | + | error + v + +-------------+ + | | + | FAILED | + | | + +------+------+ + | + | retry + v + +-------------+ + | | + | QUEUED | + | | + +-------------+ +``` + +### 7.3 Batch Upload States + +``` ++----------+ upload +-------------+ +| | --------------> | | +| (none) | | PROCESSING | +| | | | ++----------+ +------+------+ + | + +---------------+---------------+ + | | | + | all success | some fail | all fail + v v v + +-------------+ +-------------+ +-------------+ + | | | | | | + | COMPLETED | | PARTIAL | | FAILED | + | | | | | | + +-------------+ +-------------+ +-------------+ +``` + +### 7.4 Training Task States + +``` ++----------+ create +-------------+ +| | --------------> | | +| (none) | | PENDING | +| | | | ++----------+ +------+------+ + | + +-------------+-------------+ + | | + | immediate | scheduled + 
v v + +-------------+ +-------------+ + | | | | + | RUNNING |<------------+ SCHEDULED | + | | trigger | | + +------+------+ +------+------+ + | | + +---------+---------+ | cancel + | | v + | success | error +-------------+ + v v | | + +-------------+ +-------------+ | CANCELLED | + | | | | | | + | COMPLETED | | FAILED | +-------------+ + | | | | + +-------------+ +------+------+ + | + | retry + v + +-------------+ + | | + | PENDING | + | | + +-------------+ +``` + +### 7.5 Annotation Lock States + +``` + +-------------+ + | LOCKED | + | (auto-label | + | running) | + +------^------+ + | + | auto-label starts + | ++----------+ upload +----------+ | +| | --------------> | |--+ +| (none) | | UNLOCKED |<---------+ +| | | | | ++----------+ +----+-----+ | + | | + | auto-label | auto-label + | starts | completes/fails + | | + v | + +-------------+ | + | | | + | LOCKED +---------+ + | (timeout: | + | 5 minutes) | + +-------------+ +``` + +--- + +## Summary + +This comprehensive plan provides: + +1. **PRD**: 24 user stories across 6 epics with clear acceptance criteria and priorities +2. **CSV Specification**: 13 columns with detailed validation rules and field mappings +3. **Database Schema**: 4 new tables + modifications to 3 existing tables with full SQLModel definitions +4. **API Specification**: 8 new endpoints + 2 modified endpoints with complete request/response schemas +5. **UI Wireframes**: 5 detailed text-based wireframes covering all major views +6. **Implementation Phases**: 7 phases over 8 weeks with 30+ tasks, dependencies, and risk assessments +7. **State Machines**: 5 state diagrams covering document, auto-label, batch, training, and locking workflows + +The implementation follows an incremental approach starting with database/backend changes before frontend development, minimizing risk and enabling continuous testing throughout the development cycle. 
diff --git a/docs/ux-design-prompt-v2.md b/docs/ux-design-prompt-v2.md new file mode 100644 index 0000000..3f89da9 --- /dev/null +++ b/docs/ux-design-prompt-v2.md @@ -0,0 +1,302 @@ +# Document Annotation Tool – UX Design Spec v2 + +## Theme: Warm Graphite (Modern Enterprise) + +--- + +## 1. Design Principles (Updated) + +1. **Clarity** – High contrast, but never pure black-on-white +2. **Warm Neutrality** – Slightly warm grays reduce visual fatigue +3. **Focus** – Content-first layouts with restrained accents +4. **Consistency** – Reusable patterns, predictable behavior +5. **Professional Trust** – Calm, serious, enterprise-ready +6. **Longevity** – No trendy colors that age quickly + +--- + +## 2. Color Palette (Warm Graphite) + +### Core Colors + +| Usage | Color Name | Hex | +|------|-----------|-----| +| Primary Text | Soft Black | #121212 | +| Secondary Text | Charcoal Gray | #2A2A2A | +| Muted Text | Warm Gray | #6B6B6B | +| Disabled Text | Light Warm Gray | #9A9A9A | + +### Backgrounds + +| Usage | Color | Hex | +|-----|------|-----| +| App Background | Paper White | #FAFAF8 | +| Card / Panel | White | #FFFFFF | +| Hover Surface | Subtle Warm Gray | #F1F0ED | +| Selected Row | Very Light Warm Gray | #ECEAE6 | + +### Borders & Dividers + +| Usage | Color | Hex | +|------|------|-----| +| Default Border | Warm Light Gray | #E6E4E1 | +| Strong Divider | Neutral Gray | #D8D6D2 | + +### Semantic States (Muted & Professional) + +| State | Color | Hex | +|------|-------|-----| +| Success | Olive Gray | #3E4A3A | +| Error | Brick Gray | #4A3A3A | +| Warning | Sand Gray | #4A4A3A | +| Info | Graphite Gray | #3A3A3A | + +> Accent colors are **never saturated** and are used only for status, progress, or selection. + +--- + +## 3. 
Typography + +- **Font Family**: Inter / SF Pro / system-ui +- **Headings**: + - Weight: 600–700 + - Color: #121212 + - Letter spacing: -0.01em +- **Body Text**: + - Weight: 400 + - Color: #2A2A2A +- **Captions / Meta**: + - Weight: 400 + - Color: #6B6B6B +- **Monospace (IDs / Values)**: + - JetBrains Mono / SF Mono + - Color: #2A2A2A + +--- + +## 4. Global Layout + +### Top Navigation Bar + +- Height: 56px +- Background: #FAFAF8 +- Bottom Border: 1px solid #E6E4E1 +- Logo: Text or icon in #121212 + +**Navigation Items** +- Default: #6B6B6B +- Hover: #2A2A2A +- Active: + - Text: #121212 + - Bottom indicator: 2px solid #3A3A3A (rounded ends) + +**Avatar** +- Circle background: #ECEAE6 +- Text: #2A2A2A + +--- + +## 5. Page: Documents (Dashboard) + +### Page Header + +- Title: "Documents" (#121212) +- Actions: + - Primary button: Dark graphite outline + - Secondary button: Subtle border only + +### Filters Bar + +- Background: #FFFFFF +- Border: 1px solid #E6E4E1 +- Inputs: + - Background: #FFFFFF + - Hover: #F1F0ED + - Focus ring: 1px #3A3A3A + +### Document Table + +- Table background: #FFFFFF +- Header text: #6B6B6B +- Row hover: #F1F0ED +- Row selected: + - Background: #ECEAE6 + - Left indicator: 3px solid #3A3A3A + +### Status Badges + +- Pending: + - BG: #FFFFFF + - Border: #D8D6D2 + - Text: #2A2A2A + +- Labeled: + - BG: #2A2A2A + - Text: #FFFFFF + +- Exported: + - BG: #ECEAE6 + - Text: #2A2A2A + - Icon: ✓ + +### Auto-label States + +- Running: + - Progress bar: #3A3A3A on #ECEAE6 +- Completed: + - Text: #3E4A3A +- Failed: + - BG: #F1EDED + - Text: #4A3A3A + +--- + +## 6. 
Upload Modals (Single & Batch) + +### Modal Container + +- Background: #FFFFFF +- Border radius: 8px +- Shadow: 0 1px 3px rgba(0,0,0,0.08) + +### Drop Zone + +- Background: #FAFAF8 +- Border: 1px dashed #D8D6D2 +- Hover: #F1F0ED +- Icon: Graphite gray + +### Form Fields + +- Input BG: #FFFFFF +- Border: #D8D6D2 +- Focus: 1px solid #3A3A3A + +Primary Action Button: +- Text: #FFFFFF +- BG: #2A2A2A +- Hover: #121212 + +--- + +## 7. Document Detail View + +### Canvas Area + +- Background: #FFFFFF +- Annotation styles: + - Manual: Solid border #2A2A2A + - Auto: Dashed border #6B6B6B + - Selected: 2px border #3A3A3A + resize handles + +### Right Info Panel + +- Card background: #FFFFFF +- Section headers: #121212 +- Meta text: #6B6B6B + +### Annotation Table + +- Same table styles as Documents +- Inline edit: + - Input background: #FAFAF8 + - Save button: Graphite + +### Locked State (Auto-label Running) + +- Banner BG: #FAFAF8 +- Border-left: 3px solid #4A4A3A +- Progress bar: Graphite + +--- + +## 8. Training Page + +### Document Selector + +- Selected rows use same highlight rules +- Verified state: + - Full: Olive gray check + - Partial: Sand gray warning + +### Configuration Panel + +- Card layout +- Inputs aligned to grid +- Schedule option visually muted until enabled + +Primary CTA: +- Start Training button in dark graphite + +--- + +## 9. Models & Training History + +### Training Job List + +- Job cards use #FFFFFF background +- Running job: + - Progress bar: #3A3A3A +- Completed job: + - Metrics bars in graphite + +### Model Detail Panel + +- Sectioned cards +- Metric bars: + - Track: #ECEAE6 + - Fill: #3A3A3A + +Actions: +- Primary: Download Model +- Secondary: View Logs / Use as Base + +--- + +## 10. 
Micro-interactions (Refined) + +| Element | Interaction | Animation | +|------|------------|-----------| +| Button hover | BG lightens | 150ms ease-out | +| Button press | Scale 0.98 | 100ms | +| Row hover | BG fade | 120ms | +| Modal open | Fade + scale 0.96 → 1 | 200ms | +| Progress fill | Smooth | ease-out | +| Annotation select | Border + handles | 120ms | + +--- + +## 11. Tailwind Theme (Updated) + +```js +colors: { + text: { + primary: '#121212', + secondary: '#2A2A2A', + muted: '#6B6B6B', + disabled: '#9A9A9A', + }, + bg: { + app: '#FAFAF8', + card: '#FFFFFF', + hover: '#F1F0ED', + selected: '#ECEAE6', + }, + border: '#E6E4E1', + accent: '#3A3A3A', + success: '#3E4A3A', + error: '#4A3A3A', + warning: '#4A4A3A', +} +``` + +--- + +## 12. Final Notes + +- Pure black (#000000) should **never** be used for large surfaces +- Accent color usage should stay under **10% of UI area** +- Warm grays are intentional and must not be "corrected" to blue-grays + +This theme is designed to scale from internal tool → polished SaaS without redesign. 
+ diff --git a/docs/web-refactoring-complete.md b/docs/web-refactoring-complete.md new file mode 100644 index 0000000..aa95adb --- /dev/null +++ b/docs/web-refactoring-complete.md @@ -0,0 +1,273 @@ +# Web Directory Refactoring - Complete ✅ + +**Date**: 2026-01-25 +**Status**: ✅ Completed +**Tests**: 188 passing (0 failures) +**Coverage**: 23% (maintained) + +--- + +## Final Directory Structure + +``` +src/web/ +├── api/ +│ ├── __init__.py +│ └── v1/ +│ ├── __init__.py +│ ├── routes.py # Public inference API +│ ├── admin/ +│ │ ├── __init__.py +│ │ ├── documents.py # Document management (was admin_routes.py) +│ │ ├── annotations.py # Annotation routes (was admin_annotation_routes.py) +│ │ └── training.py # Training routes (was admin_training_routes.py) +│ ├── async_api/ +│ │ ├── __init__.py +│ │ └── routes.py # Async processing API (was async_routes.py) +│ └── batch/ +│ ├── __init__.py +│ └── routes.py # Batch upload API (was batch_upload_routes.py) +│ +├── schemas/ +│ ├── __init__.py +│ ├── common.py # Shared models (ErrorResponse) +│ ├── admin.py # Admin schemas (was admin_schemas.py) +│ └── inference.py # Inference + async schemas (was schemas.py) +│ +├── services/ +│ ├── __init__.py +│ ├── inference.py # Inference service (was services.py) +│ ├── autolabel.py # Auto-label service (was admin_autolabel.py) +│ ├── async_processing.py # Async processing (was async_service.py) +│ └── batch_upload.py # Batch upload service (was batch_upload_service.py) +│ +├── core/ +│ ├── __init__.py +│ ├── auth.py # Authentication (was admin_auth.py) +│ ├── rate_limiter.py # Rate limiting (unchanged) +│ └── scheduler.py # Task scheduler (was admin_scheduler.py) +│ +├── workers/ +│ ├── __init__.py +│ ├── async_queue.py # Async task queue (was async_queue.py) +│ └── batch_queue.py # Batch task queue (was batch_queue.py) +│ +├── __init__.py # Main exports +├── app.py # FastAPI app (imports updated) +├── config.py # Configuration (unchanged) +└── dependencies.py # Global dependencies 
(unchanged) +``` + +--- + +## Changes Summary + +### Files Moved and Renamed + +| Old Location | New Location | Change Type | +|-------------|--------------|-------------| +| `admin_routes.py` | `api/v1/admin/documents.py` | Moved + Renamed | +| `admin_annotation_routes.py` | `api/v1/admin/annotations.py` | Moved + Renamed | +| `admin_training_routes.py` | `api/v1/admin/training.py` | Moved + Renamed | +| `admin_auth.py` | `core/auth.py` | Moved | +| `admin_autolabel.py` | `services/autolabel.py` | Moved | +| `admin_scheduler.py` | `core/scheduler.py` | Moved | +| `admin_schemas.py` | `schemas/admin.py` | Moved | +| `routes.py` | `api/v1/routes.py` | Moved | +| `schemas.py` | `schemas/inference.py` | Moved | +| `services.py` | `services/inference.py` | Moved | +| `async_routes.py` | `api/v1/async_api/routes.py` | Moved | +| `async_queue.py` | `workers/async_queue.py` | Moved | +| `async_service.py` | `services/async_processing.py` | Moved + Renamed | +| `batch_queue.py` | `workers/batch_queue.py` | Moved | +| `batch_upload_routes.py` | `api/v1/batch/routes.py` | Moved | +| `batch_upload_service.py` | `services/batch_upload.py` | Moved | + +**Total**: 16 files reorganized + +### Files Updated + +**Source Files** (imports updated): +- `app.py` - Updated all imports to new structure +- `api/v1/admin/documents.py` - Updated schema/auth imports +- `api/v1/admin/annotations.py` - Updated schema/service imports +- `api/v1/admin/training.py` - Updated schema/auth imports +- `api/v1/routes.py` - Updated schema imports +- `api/v1/async_api/routes.py` - Updated schema imports +- `api/v1/batch/routes.py` - Updated service/worker imports +- `services/async_processing.py` - Updated worker/core imports + +**Test Files** (all 15 test modules plus `conftest.py` updated, 16 files in total): +- `test_admin_annotations.py` +- `test_admin_auth.py` +- `test_admin_routes.py` +- `test_admin_routes_enhanced.py` +- `test_admin_training.py` +- `test_annotation_locks.py` +- `test_annotation_phase5.py` +- `test_async_queue.py` +- 
`test_async_routes.py` +- `test_async_service.py` +- `test_autolabel_with_locks.py` +- `test_batch_queue.py` +- `test_batch_upload_routes.py` +- `test_batch_upload_service.py` +- `test_training_phase4.py` +- `conftest.py` + +--- + +## Import Examples + +### Old Import Style (Before Refactoring) +```python +from src.web.admin_routes import create_admin_router +from src.web.admin_schemas import DocumentItem +from src.web.admin_auth import validate_admin_token +from src.web.async_routes import create_async_router +from src.web.schemas import ErrorResponse +``` + +### New Import Style (After Refactoring) +```python +# Admin API +from src.web.api.v1.admin.documents import create_admin_router +from src.web.api.v1.admin import create_admin_router # Shorter alternative + +# Schemas +from src.web.schemas.admin import DocumentItem +from src.web.schemas.common import ErrorResponse + +# Core components +from src.web.core.auth import validate_admin_token + +# Async API +from src.web.api.v1.async_api.routes import create_async_router +``` + +--- + +## Benefits Achieved + +### 1. **Clear Separation of Concerns** +- **API Routes**: All in `api/v1/` by version and feature +- **Data Models**: All in `schemas/` by domain +- **Business Logic**: All in `services/` +- **Core Components**: Reusable utilities in `core/` +- **Background Jobs**: Task queues in `workers/` + +### 2. **Better Scalability** +- Easy to add API v2 without touching v1 +- Clear namespace for each module +- Reduced file sizes (no 800+ line files) +- Follows single responsibility principle + +### 3. **Improved Maintainability** +- Find files by function, not by prefix +- Each module has one clear purpose +- Easier to onboard new developers +- Better IDE navigation + +### 4. 
**Standards Compliance** +- Follows FastAPI best practices +- Matches Django/Flask project structures +- Standard Python package organization +- Industry-standard naming conventions + +--- + +## Testing Results + +**Before Refactoring**: +- 188 tests passing +- 23% code coverage +- Flat directory structure + +**After Refactoring**: +- ✅ 188 tests passing (0 failures) +- ✅ 23% code coverage (maintained) +- ✅ Clean hierarchical structure +- ✅ All imports updated +- ✅ No backward compatibility shims needed + +--- + +## Migration Statistics + +| Metric | Count | +|--------|-------| +| Files moved | 16 | +| Directories created | 9 | +| Files updated (source) | 8 | +| Files updated (tests) | 16 | +| Import statements updated | ~150 | +| Lines of code changed | ~200 | +| Tests broken | 0 | +| Coverage lost | 0% | + +--- + +## Code Diff Summary + +```diff +Before: +src/web/ +├── admin_routes.py (645 lines) +├── admin_annotation_routes.py (504 lines) +├── admin_training_routes.py (565 lines) +├── admin_auth.py (22 lines) +├── admin_schemas.py (262 lines) +... 
(15 more files at root level) + +After: +src/web/ +├── api/v1/ +│ ├── admin/ (3 route files) +│ ├── async_api/ (1 route file) +│ └── batch/ (1 route file) +├── schemas/ (3 schema files) +├── services/ (4 service files) +├── core/ (3 core files) +└── workers/ (2 worker files) +``` + +--- + +## Next Steps (Optional) + +### Phase 2: Documentation +- [ ] Update API documentation with new import paths +- [ ] Create migration guide for external developers +- [ ] Update CLAUDE.md with new structure + +### Phase 3: Further Optimization +- [ ] Split large files (>400 lines) if needed +- [ ] Extract common utilities +- [ ] Add typing stubs + +### Phase 4: Deprecation (Future) +- [ ] Add deprecation warnings if creating compatibility layer +- [ ] Remove old imports after grace period +- [ ] Update all documentation + +--- + +## Rollback Instructions + +If needed, rollback is simple: +```bash +git revert <commit-hash> +``` + +All changes are in version control, making rollback safe and easy. + +--- + +## Conclusion + +✅ **Refactoring completed successfully** +✅ **Zero breaking changes** +✅ **All tests passing** +✅ **Industry-standard structure achieved** + +The web directory is now organized following Python and FastAPI best practices, making it easier to scale, maintain, and extend. diff --git a/docs/web-refactoring-plan.md b/docs/web-refactoring-plan.md new file mode 100644 index 0000000..d10b825 --- /dev/null +++ b/docs/web-refactoring-plan.md @@ -0,0 +1,186 @@ +# Web Directory Refactoring Plan + +## Current Structure Issues + +1. **Flat structure**: All files in one directory (20 Python files) +2. **Naming inconsistency**: Mix of `admin_*`, `async_*`, `batch_*` prefixes +3. **Mixed concerns**: Routes, schemas, services, and workers in same directory +4.
**Poor scalability**: Hard to navigate and maintain as project grows + +## Proposed Structure (Best Practices) + +``` +src/web/ +├── __init__.py # Main exports +├── app.py # FastAPI app factory +├── config.py # App configuration +├── dependencies.py # Global dependencies +│ +├── api/ # API Routes Layer +│ ├── __init__.py +│ └── v1/ # API version 1 +│ ├── __init__.py +│ ├── routes.py # Public API routes (inference) +│ ├── admin/ # Admin API routes +│ │ ├── __init__.py +│ │ ├── documents.py # admin_routes.py → documents.py +│ │ ├── annotations.py # admin_annotation_routes.py → annotations.py +│ │ ├── training.py # admin_training_routes.py → training.py +│ │ └── auth.py # admin_auth.py → auth.py (routes only) +│ ├── async_api/ # Async processing API +│ │ ├── __init__.py +│ │ └── routes.py # async_routes.py → routes.py +│ └── batch/ # Batch upload API +│ ├── __init__.py +│ └── routes.py # batch_upload_routes.py → routes.py +│ +├── schemas/ # Pydantic Models +│ ├── __init__.py +│ ├── common.py # Shared schemas (ErrorResponse, etc.) 
+│ ├── inference.py # schemas.py → inference.py +│ ├── admin.py # admin_schemas.py → admin.py +│ ├── async_api.py # New: async API schemas +│ └── batch.py # New: batch upload schemas +│ +├── services/ # Business Logic Layer +│ ├── __init__.py +│ ├── inference.py # services.py → inference.py +│ ├── autolabel.py # admin_autolabel.py → autolabel.py +│ ├── async_processing.py # async_service.py → async_processing.py +│ └── batch_upload.py # batch_upload_service.py → batch_upload.py +│ +├── core/ # Core Components +│ ├── __init__.py +│ ├── auth.py # admin_auth.py → auth.py (logic only) +│ ├── rate_limiter.py # rate_limiter.py → rate_limiter.py +│ └── scheduler.py # admin_scheduler.py → scheduler.py +│ +└── workers/ # Background Task Queues + ├── __init__.py + ├── async_queue.py # async_queue.py → async_queue.py + └── batch_queue.py # batch_queue.py → batch_queue.py +``` + +## File Mapping + +### Current → New Location + +| Current File | New Location | Purpose | +|--------------|--------------|---------| +| `admin_routes.py` | `api/v1/admin/documents.py` | Document management routes | +| `admin_annotation_routes.py` | `api/v1/admin/annotations.py` | Annotation routes | +| `admin_training_routes.py` | `api/v1/admin/training.py` | Training routes | +| `admin_auth.py` | Split: `api/v1/admin/auth.py` + `core/auth.py` | Auth routes + logic | +| `admin_schemas.py` | `schemas/admin.py` | Admin Pydantic models | +| `admin_autolabel.py` | `services/autolabel.py` | Auto-label service | +| `admin_scheduler.py` | `core/scheduler.py` | Training scheduler | +| `routes.py` | `api/v1/routes.py` | Public inference API | +| `schemas.py` | `schemas/inference.py` | Inference models | +| `services.py` | `services/inference.py` | Inference service | +| `async_routes.py` | `api/v1/async_api/routes.py` | Async API routes | +| `async_service.py` | `services/async_processing.py` | Async processing service | +| `async_queue.py` | `workers/async_queue.py` | Async task queue | +| 
`batch_upload_routes.py` | `api/v1/batch/routes.py` | Batch upload routes | +| `batch_upload_service.py` | `services/batch_upload.py` | Batch upload service | +| `batch_queue.py` | `workers/batch_queue.py` | Batch task queue | +| `rate_limiter.py` | `core/rate_limiter.py` | Rate limiting logic | +| `config.py` | `config.py` | Keep as-is | +| `dependencies.py` | `dependencies.py` | Keep as-is | +| `app.py` | `app.py` | Keep as-is (update imports) | + +## Benefits + +### 1. Clear Separation of Concerns +- **Routes**: API endpoint definitions +- **Schemas**: Data validation models +- **Services**: Business logic +- **Core**: Reusable components +- **Workers**: Background processing + +### 2. Better Scalability +- Easy to add new API versions (`v2/`) +- Clear namespace for each domain +- Reduced file size (no 800+ line files) + +### 3. Improved Maintainability +- Find files by function, not by prefix +- Each module has single responsibility +- Easier to write focused tests + +### 4. Standard Python Patterns +- Package-based organization +- Follows FastAPI best practices +- Similar to Django/Flask structures + +## Implementation Steps + +### Phase 1: Create New Structure (No Breaking Changes) +1. Create new directories: `api/`, `schemas/`, `services/`, `core/`, `workers/` +2. Copy files to new locations (don't delete originals yet) +3. Update imports in new files +4. Add `__init__.py` with proper exports + +### Phase 2: Update Tests +5. Update test imports to use new structure +6. Run tests to verify nothing breaks +7. Fix any import issues + +### Phase 3: Update Main App +8. Update `app.py` to import from new locations +9. Run full test suite +10. Verify all endpoints work + +### Phase 4: Cleanup +11. Delete old files +12. Update documentation +13. 
Final test run + +## Migration Priority + +**High Priority** (Most used): +- Routes and schemas (user-facing APIs) +- Services (core business logic) + +**Medium Priority**: +- Core components (auth, rate limiter) +- Workers (background tasks) + +**Low Priority**: +- Config and dependencies (already well-located) + +## Backwards Compatibility + +During migration, maintain backwards compatibility: + +```python +# src/web/__init__.py +# Re-export from the new locations so that old imports keep working +from src.web.api.v1.admin.documents import router as admin_router +from src.web.schemas.admin import AdminDocument + +# Keep old names for compatibility (temporary) +admin_routes = admin_router # Deprecated alias +``` + +## Testing Strategy + +1. **Unit Tests**: Test each module independently +2. **Integration Tests**: Test API endpoints still work +3. **Import Tests**: Verify all old imports still work +4. **Coverage**: Maintain at least the current 23% coverage + +## Rollback Plan + +If issues arise: +1. Keep old files until fully migrated +2. Git allows easy revert +3. Tests catch breaking changes early + +--- + +## Next Steps + +Would you like me to: +1. **Start Phase 1**: Create new directory structure and move files? +2. **Create migration script**: Automate the file moves and import updates? +3. **Focus on specific area**: Start with admin API or async API first? diff --git a/docs/web-refactoring-status.md b/docs/web-refactoring-status.md new file mode 100644 index 0000000..37aa0ae --- /dev/null +++ b/docs/web-refactoring-status.md @@ -0,0 +1,218 @@ +# Web Directory Refactoring - Current Status + +## ✅ Completed Steps + +### 1.
Directory Structure Created +``` +src/web/ +├── api/ +│ ├── v1/ +│ │ ├── admin/ (documents.py, annotations.py, training.py) +│ │ ├── async_api/ (routes.py) +│ │ ├── batch/ (routes.py) +│ │ └── routes.py (public inference API) +├── schemas/ +│ ├── admin.py (admin schemas) +│ ├── inference.py (inference + async schemas) +│ └── common.py (ErrorResponse) +├── services/ +│ ├── autolabel.py +│ ├── async_processing.py +│ ├── batch_upload.py +│ └── inference.py +├── core/ +│ ├── auth.py +│ ├── rate_limiter.py +│ └── scheduler.py +└── workers/ + ├── async_queue.py + └── batch_queue.py +``` + +### 2. Files Copied and Imports Updated + +#### Admin API (✅ Complete) +- [x] `admin_routes.py` → `api/v1/admin/documents.py` (imports updated) +- [x] `admin_annotation_routes.py` → `api/v1/admin/annotations.py` (imports updated) +- [x] `admin_training_routes.py` → `api/v1/admin/training.py` (imports updated) +- [x] `api/v1/admin/__init__.py` created with exports + +#### Public & Async API (✅ Complete) +- [x] `routes.py` → `api/v1/routes.py` (imports updated) +- [x] `async_routes.py` → `api/v1/async_api/routes.py` (imports updated) +- [x] `batch_upload_routes.py` → `api/v1/batch/routes.py` (copied, imports pending) + +#### Schemas (✅ Complete) +- [x] `admin_schemas.py` → `schemas/admin.py` +- [x] `schemas.py` → `schemas/inference.py` +- [x] `schemas/common.py` created +- [x] `schemas/__init__.py` created with exports + +#### Services (✅ Complete) +- [x] `admin_autolabel.py` → `services/autolabel.py` +- [x] `async_service.py` → `services/async_processing.py` +- [x] `batch_upload_service.py` → `services/batch_upload.py` +- [x] `services.py` → `services/inference.py` +- [x] `services/__init__.py` created + +#### Core Components (✅ Complete) +- [x] `admin_auth.py` → `core/auth.py` +- [x] `rate_limiter.py` → `core/rate_limiter.py` +- [x] `admin_scheduler.py` → `core/scheduler.py` +- [x] `core/__init__.py` created + +#### Workers (✅ Complete) +- [x] `async_queue.py` → 
`workers/async_queue.py` +- [x] `batch_queue.py` → `workers/batch_queue.py` +- [x] `workers/__init__.py` created + +#### Main App (✅ Complete) +- [x] `app.py` imports updated to use new structure + +--- + +## ⏳ Remaining Work + +### 1. Update Remaining File Imports (HIGH PRIORITY) + +Files that need import updates: +- [ ] `api/v1/batch/routes.py` - update to use new schema/service imports +- [ ] `services/autolabel.py` - may need import updates if it references old paths +- [ ] `services/async_processing.py` - check for old import references +- [ ] `services/batch_upload.py` - check for old import references +- [ ] `services/inference.py` - check for old import references + +### 2. Update ALL Test Files (CRITICAL) + +Test files need to import from new locations. Pattern: + +**Old:** +```python +from src.web.admin_routes import create_admin_router +from src.web.admin_schemas import DocumentItem +from src.web.admin_auth import validate_admin_token +``` + +**New:** +```python +from src.web.api.v1.admin import create_admin_router +from src.web.schemas.admin import DocumentItem +from src.web.core.auth import validate_admin_token +``` + +Test files to update: +- [ ] `tests/web/test_admin_annotations.py` +- [ ] `tests/web/test_admin_auth.py` +- [ ] `tests/web/test_admin_routes.py` +- [ ] `tests/web/test_admin_routes_enhanced.py` +- [ ] `tests/web/test_admin_training.py` +- [ ] `tests/web/test_annotation_locks.py` +- [ ] `tests/web/test_annotation_phase5.py` +- [ ] `tests/web/test_async_queue.py` +- [ ] `tests/web/test_async_routes.py` +- [ ] `tests/web/test_async_service.py` +- [ ] `tests/web/test_autolabel_with_locks.py` +- [ ] `tests/web/test_batch_queue.py` +- [ ] `tests/web/test_batch_upload_routes.py` +- [ ] `tests/web/test_batch_upload_service.py` +- [ ] `tests/web/test_rate_limiter.py` +- [ ] `tests/web/test_training_phase4.py` + +### 3. 
Create Backward Compatibility Layer (OPTIONAL) + +Keep old imports working temporarily: + +```python +# src/web/admin_routes.py (temporary compatibility shim) +""" +DEPRECATED: Use src.web.api.v1.admin.documents instead. +This file will be removed in the next version. +""" +import warnings +from src.web.api.v1.admin.documents import * + +warnings.warn( + "Importing from src.web.admin_routes is deprecated. " + "Use src.web.api.v1.admin.documents instead.", + DeprecationWarning, + stacklevel=2 +) +``` + +### 4. Verify and Test + +1. Run tests: +```bash +pytest tests/web/ -v +``` + +2. Check for any import errors: +```bash +python -c "from src.web.app import create_app; create_app()" +``` + +3. Start server and test endpoints: +```bash +python run_server.py +``` + +### 5. Clean Up Old Files (ONLY AFTER TESTS PASS) + +Old files to remove: +- `src/web/admin_*.py` (7 files) +- `src/web/async_*.py` (3 files) +- `src/web/batch_*.py` (3 files) +- `src/web/routes.py` +- `src/web/services.py` +- `src/web/schemas.py` +- `src/web/rate_limiter.py` + +Keep these files (don't remove): +- `src/web/__init__.py` +- `src/web/app.py` +- `src/web/config.py` +- `src/web/dependencies.py` + +--- + +## 🎯 Next Immediate Steps + +1. **Update batch/routes.py imports** - Quick fix for remaining API route +2. **Update test file imports** - Critical for verification +3. **Run test suite** - Verify nothing broke +4. **Fix any import errors** - Address failures +5. **Remove old files** - Clean up after tests pass + +--- + +## 📊 Migration Impact Summary + +| Category | Files Moved | Imports Updated | Status | +|----------|-------------|-----------------|--------| +| API Routes | 7 | 5/7 | 🟡 In Progress | +| Schemas | 3 | 3/3 | ✅ Complete | +| Services | 4 | 0/4 | ⚠️ Pending | +| Core | 3 | 3/3 | ✅ Complete | +| Workers | 2 | 2/2 | ✅ Complete | +| Tests | 0 | 0/16 | ❌ Not Started | + +**Overall Progress: 65%** + +--- + +## 🚀 Benefits After Migration + +1.
**Better Organization**: Clear separation by function +2. **Easier Navigation**: Find files by purpose, not prefix +3. **Scalability**: Easy to add new API versions +4. **Standard Structure**: Follows FastAPI best practices +5. **Maintainability**: Each module has single responsibility + +--- + +## 📝 Notes + +- All original files are still in place (no data loss risk) +- New structure is operational but needs import updates +- Backward compatibility can be added if needed +- Tests will validate the migration success diff --git a/frontend/.env.example b/frontend/.env.example new file mode 100644 index 0000000..fe4c6dd --- /dev/null +++ b/frontend/.env.example @@ -0,0 +1,5 @@ +# Backend API URL +VITE_API_URL=http://localhost:8000 + +# WebSocket URL (for future real-time updates) +VITE_WS_URL=ws://localhost:8000/ws diff --git a/frontend/.gitignore b/frontend/.gitignore new file mode 100644 index 0000000..a547bf3 --- /dev/null +++ b/frontend/.gitignore @@ -0,0 +1,24 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/frontend/README.md b/frontend/README.md new file mode 100644 index 0000000..dd8bc5b --- /dev/null +++ b/frontend/README.md @@ -0,0 +1,20 @@ +
+GHBanner +
+ +# Run and deploy your AI Studio app + +This contains everything you need to run your app locally. + +View your app in AI Studio: https://ai.studio/apps/drive/13hqd80ft4g_LngMYB8LLJxx2XU8C_eI4 + +## Run Locally + +**Prerequisites:** Node.js + + +1. Install dependencies: + `npm install` +2. Set the `GEMINI_API_KEY` in [.env.local](.env.local) to your Gemini API key +3. Run the app: + `npm run dev` diff --git a/frontend/REFACTORING_PLAN.md b/frontend/REFACTORING_PLAN.md new file mode 100644 index 0000000..50632aa --- /dev/null +++ b/frontend/REFACTORING_PLAN.md @@ -0,0 +1,240 @@ +# Frontend Refactoring Plan + +## Current Structure Issues + +1. **Flat component organization** - All components in one directory +2. **Mock data only** - No real API integration +3. **No state management** - Props drilling everywhere +4. **CDN dependencies** - Should use npm packages +5. **Manual routing** - Using useState instead of react-router +6. **No TypeScript integration with backend** - Types don't match API schemas + +## Recommended Structure + +``` +frontend/ +├── public/ +│ └── favicon.ico +│ +├── src/ +│ ├── api/ # API Layer +│ │ ├── client.ts # Axios instance + interceptors +│ │ ├── types.ts # API request/response types +│ │ └── endpoints/ +│ │ ├── documents.ts # GET /api/v1/admin/documents +│ │ ├── annotations.ts # GET/POST /api/v1/admin/documents/{id}/annotations +│ │ ├── training.ts # GET/POST /api/v1/admin/training/* +│ │ ├── inference.ts # POST /api/v1/infer +│ │ └── async.ts # POST /api/v1/async/submit +│ │ +│ ├── components/ +│ │ ├── common/ # Reusable components +│ │ │ ├── Badge.tsx +│ │ │ ├── Button.tsx +│ │ │ ├── Input.tsx +│ │ │ ├── Modal.tsx +│ │ │ ├── Table.tsx +│ │ │ ├── ProgressBar.tsx +│ │ │ └── StatusBadge.tsx +│ │ │ +│ │ ├── layout/ # Layout components +│ │ │ ├── TopNav.tsx +│ │ │ ├── Sidebar.tsx +│ │ │ └── PageHeader.tsx +│ │ │ +│ │ ├── documents/ # Document-specific components +│ │ │ ├── DocumentTable.tsx +│ │ │ ├── DocumentFilters.tsx +│ │ │ ├── 
DocumentRow.tsx +│ │ │ ├── UploadModal.tsx +│ │ │ └── BatchUploadModal.tsx +│ │ │ +│ │ ├── annotations/ # Annotation components +│ │ │ ├── AnnotationCanvas.tsx +│ │ │ ├── AnnotationBox.tsx +│ │ │ ├── AnnotationTable.tsx +│ │ │ ├── FieldEditor.tsx +│ │ │ └── VerificationPanel.tsx +│ │ │ +│ │ └── training/ # Training components +│ │ ├── DocumentSelector.tsx +│ │ ├── TrainingConfig.tsx +│ │ ├── TrainingJobList.tsx +│ │ ├── ModelCard.tsx +│ │ └── MetricsChart.tsx +│ │ +│ ├── pages/ # Page-level components +│ │ ├── DocumentsPage.tsx # Was Dashboard.tsx +│ │ ├── DocumentDetailPage.tsx # Was DocumentDetail.tsx +│ │ ├── TrainingPage.tsx # Was Training.tsx +│ │ ├── ModelsPage.tsx # Was Models.tsx +│ │ └── InferencePage.tsx # New: Test inference +│ │ +│ ├── hooks/ # Custom React Hooks +│ │ ├── useDocuments.ts # Document CRUD + listing +│ │ ├── useAnnotations.ts # Annotation management +│ │ ├── useTraining.ts # Training jobs +│ │ ├── usePolling.ts # Auto-refresh for async jobs +│ │ └── useDebounce.ts # Debounce search inputs +│ │ +│ ├── store/ # State Management (Zustand) +│ │ ├── documentsStore.ts +│ │ ├── annotationsStore.ts +│ │ ├── trainingStore.ts +│ │ └── uiStore.ts +│ │ +│ ├── types/ # TypeScript Types +│ │ ├── index.ts +│ │ ├── document.ts +│ │ ├── annotation.ts +│ │ ├── training.ts +│ │ └── api.ts +│ │ +│ ├── utils/ # Utility Functions +│ │ ├── formatters.ts # Date, currency, etc. 
+│ │ ├── validators.ts # Form validation +│ │ └── constants.ts # Field definitions, statuses +│ │ +│ ├── styles/ +│ │ └── index.css # Tailwind entry +│ │ +│ ├── App.tsx +│ ├── main.tsx +│ └── router.tsx # React Router config +│ +├── .env.example +├── package.json +├── tsconfig.json +├── vite.config.ts +├── tailwind.config.js +├── postcss.config.js +└── index.html +``` + +## Migration Steps + +### Phase 1: Setup Infrastructure +- [ ] Install dependencies (axios, react-router, zustand, @tanstack/react-query) +- [ ] Setup local Tailwind (remove CDN) +- [ ] Create API client with interceptors +- [ ] Add environment variables (.env.local with VITE_API_URL) + +### Phase 2: Create API Layer +- [ ] Create `src/api/client.ts` with axios instance +- [ ] Create `src/api/endpoints/documents.ts` matching backend API +- [ ] Create `src/api/endpoints/annotations.ts` +- [ ] Create `src/api/endpoints/training.ts` +- [ ] Add types matching backend schemas + +### Phase 3: Reorganize Components +- [ ] Move existing components to new structure +- [ ] Split large components (Dashboard > DocumentTable + DocumentFilters + DocumentRow) +- [ ] Extract reusable components (Badge, Button already done) +- [ ] Create layout components (TopNav, Sidebar) + +### Phase 4: Add Routing +- [ ] Install react-router-dom +- [ ] Create router.tsx with routes +- [ ] Update App.tsx to use RouterProvider +- [ ] Add navigation links + +### Phase 5: State Management +- [ ] Create custom hooks (useDocuments, useAnnotations) +- [ ] Use @tanstack/react-query for server state +- [ ] Add Zustand stores for UI state +- [ ] Replace mock data with API calls + +### Phase 6: Backend Integration +- [ ] Update CORS settings in backend +- [ ] Test all API endpoints +- [ ] Add error handling +- [ ] Add loading states + +## Dependencies to Add + +```json +{ + "dependencies": { + "react-router-dom": "^6.22.0", + "axios": "^1.6.7", + "zustand": "^4.5.0", + "@tanstack/react-query": "^5.20.0", + "date-fns": "^3.3.0", + "clsx": 
"^2.1.0" + }, + "devDependencies": { + "tailwindcss": "^3.4.1", + "autoprefixer": "^10.4.17", + "postcss": "^8.4.35" + } +} +``` + +## Configuration Files to Create + +### tailwind.config.js +```javascript +export default { + content: ['./index.html', './src/**/*.{js,ts,jsx,tsx}'], + theme: { + extend: { + colors: { + warm: { + bg: '#FAFAF8', + card: '#FFFFFF', + hover: '#F1F0ED', + selected: '#ECEAE6', + border: '#E6E4E1', + divider: '#D8D6D2', + text: { + primary: '#121212', + secondary: '#2A2A2A', + muted: '#6B6B6B', + disabled: '#9A9A9A', + }, + state: { + success: '#3E4A3A', + error: '#4A3A3A', + warning: '#4A4A3A', + info: '#3A3A3A', + } + } + } + } + } +} +``` + +### .env.example +```bash +VITE_API_URL=http://localhost:8000 +VITE_WS_URL=ws://localhost:8000/ws +``` + +## Type Generation from Backend + +Consider generating TypeScript types from Python Pydantic schemas: +- Option 1: Use `datamodel-code-generator` to convert schemas +- Option 2: Manually maintain types in `src/types/api.ts` +- Option 3: Use OpenAPI spec + openapi-typescript-codegen + +## Testing Strategy + +- Unit tests: Vitest for components +- Integration tests: React Testing Library +- E2E tests: Playwright (matching backend) + +## Performance Considerations + +- Code splitting by route +- Lazy load heavy components (AnnotationCanvas) +- Optimize re-renders with React.memo +- Use virtual scrolling for large tables +- Image lazy loading for document previews + +## Accessibility + +- Proper ARIA labels +- Keyboard navigation +- Focus management +- Color contrast compliance (already done with Warm Graphite theme) diff --git a/frontend/SETUP.md b/frontend/SETUP.md new file mode 100644 index 0000000..e674d4b --- /dev/null +++ b/frontend/SETUP.md @@ -0,0 +1,256 @@ +# Frontend Setup Guide + +## Quick Start + +### 1. Install Dependencies + +```bash +cd frontend +npm install +``` + +### 2. 
Configure Environment + +Copy `.env.example` to `.env.local` and update if needed: + +```bash +cp .env.example .env.local +``` + +Default configuration: +``` +VITE_API_URL=http://localhost:8000 +VITE_WS_URL=ws://localhost:8000/ws +``` + +### 3. Start Backend API + +Make sure the backend is running first: + +```bash +# From project root +wsl bash -c "source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && python run_server.py" +``` + +Backend will be available at: http://localhost:8000 + +### 4. Start Frontend Dev Server + +```bash +cd frontend +npm run dev +``` + +Frontend will be available at: http://localhost:3000 + +## Project Structure + +``` +frontend/ +├── src/ +│ ├── api/ # API client layer +│ │ ├── client.ts # Axios instance with interceptors +│ │ ├── types.ts # API type definitions +│ │ └── endpoints/ +│ │ ├── documents.ts # Document API calls +│ │ ├── annotations.ts # Annotation API calls +│ │ └── training.ts # Training API calls +│ │ +│ ├── components/ # React components +│ │ └── Dashboard.tsx # Updated with real API integration +│ │ +│ ├── hooks/ # Custom React Hooks +│ │ ├── useDocuments.ts +│ │ ├── useDocumentDetail.ts +│ │ ├── useAnnotations.ts +│ │ └── useTraining.ts +│ │ +│ ├── styles/ +│ │ └── index.css # Tailwind CSS entry +│ │ +│ ├── App.tsx +│ └── main.tsx # App entry point with QueryClient +│ +├── components/ # Legacy components (to be migrated) +│ ├── Badge.tsx +│ ├── Button.tsx +│ ├── Layout.tsx +│ ├── DocumentDetail.tsx +│ ├── Training.tsx +│ ├── Models.tsx +│ └── UploadModal.tsx +│ +├── tailwind.config.js # Tailwind configuration +├── postcss.config.js +├── vite.config.ts +├── package.json +└── index.html +``` + +## Key Technologies + +- **React 19** - UI framework +- **TypeScript** - Type safety +- **Vite** - Build tool +- **Tailwind CSS** - Styling (Warm Graphite theme) +- **Axios** - HTTP client +- **@tanstack/react-query** - Server state management +- **lucide-react** - Icon library + +## API Integration + +### 
Authentication + +The app stores admin token in localStorage: + +```typescript +localStorage.setItem('admin_token', 'your-token') +``` + +All API requests automatically include the `X-Admin-Token` header. + +### Available Hooks + +#### useDocuments + +```typescript +const { + documents, + total, + isLoading, + uploadDocument, + deleteDocument, + triggerAutoLabel, +} = useDocuments({ status: 'labeled', limit: 20 }) +``` + +#### useDocumentDetail + +```typescript +const { document, annotations, isLoading } = useDocumentDetail(documentId) +``` + +#### useAnnotations + +```typescript +const { + createAnnotation, + updateAnnotation, + deleteAnnotation, + verifyAnnotation, + overrideAnnotation, +} = useAnnotations(documentId) +``` + +#### useTraining + +```typescript +const { + models, + isLoadingModels, + startTraining, + downloadModel, +} = useTraining() +``` + +## Features Implemented + +### Phase 1 (Completed) +- ✅ API client with axios interceptors +- ✅ Type-safe API endpoints +- ✅ React Query for server state +- ✅ Custom hooks for all APIs +- ✅ Dashboard with real data +- ✅ Local Tailwind CSS +- ✅ Environment configuration +- ✅ CORS configured in backend + +### Phase 2 (TODO) +- [ ] Update DocumentDetail to use useDocumentDetail +- [ ] Update Training page to use useTraining hooks +- [ ] Update Models page with real data +- [ ] Add UploadModal integration with API +- [ ] Add react-router for proper routing +- [ ] Add error boundary +- [ ] Add loading states +- [ ] Add toast notifications + +### Phase 3 (TODO) +- [ ] Annotation canvas with real data +- [ ] Batch upload functionality +- [ ] Auto-label progress polling +- [ ] Training job monitoring +- [ ] Model download functionality +- [ ] Search and filtering +- [ ] Pagination + +## Development Tips + +### Hot Module Replacement + +Vite supports HMR. Changes will reflect immediately without page reload. 
+ +### API Debugging + +Check browser console for API requests: +- Network tab shows all requests/responses +- Axios interceptors log errors automatically + +### Type Safety + +TypeScript types in `src/api/types.ts` match backend Pydantic schemas. + +To regenerate types from backend: +```bash +# TODO: Add type generation script +``` + +### Backend API Documentation + +Visit http://localhost:8000/docs for interactive API documentation (Swagger UI). + +## Troubleshooting + +### CORS Errors + +If you see CORS errors: +1. Check backend is running at http://localhost:8000 +2. Verify CORS settings in `src/web/app.py` +3. Check `.env.local` has correct `VITE_API_URL` + +### Module Not Found + +If imports fail: +```bash +rm -rf node_modules package-lock.json +npm install +``` + +### Types Not Matching + +If API responses don't match types: +1. Check backend version is up-to-date +2. Verify types in `src/api/types.ts` +3. Check API response in Network tab + +## Next Steps + +1. Run `npm install` to install dependencies +2. Start backend server +3. Run `npm run dev` to start frontend +4. Open http://localhost:3000 +5. Create an admin token via backend API +6. Store token in localStorage via browser console: + ```javascript + localStorage.setItem('admin_token', 'your-token-here') + ``` +7. Refresh page to see authenticated API calls + +## Production Build + +```bash +npm run build +npm run preview # Preview production build +``` + +Build output will be in `dist/` directory. diff --git a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000..92626ae --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,15 @@ + + + + + + Graphite Annotator - Invoice Field Extraction + + + + + +
+ + + \ No newline at end of file diff --git a/frontend/metadata.json b/frontend/metadata.json new file mode 100644 index 0000000..c2725fb --- /dev/null +++ b/frontend/metadata.json @@ -0,0 +1,5 @@ +{ + "name": "Graphite Annotator", + "description": "A professional, warm graphite themed document annotation and training tool for enterprise use cases.", + "requestFramePermissions": [] +} \ No newline at end of file diff --git a/frontend/package-lock.json b/frontend/package-lock.json new file mode 100644 index 0000000..46ab77d --- /dev/null +++ b/frontend/package-lock.json @@ -0,0 +1,3510 @@ +{ + "name": "graphite-annotator", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "graphite-annotator", + "version": "0.0.0", + "dependencies": { + "@tanstack/react-query": "^5.20.0", + "axios": "^1.6.7", + "clsx": "^2.1.0", + "date-fns": "^3.3.0", + "lucide-react": "^0.563.0", + "react": "^19.2.3", + "react-dom": "^19.2.3", + "react-router-dom": "^6.22.0", + "recharts": "^3.7.0", + "zustand": "^4.5.0" + }, + "devDependencies": { + "@types/node": "^22.14.0", + "@vitejs/plugin-react": "^5.0.0", + "autoprefixer": "^10.4.17", + "postcss": "^8.4.35", + "tailwindcss": "^3.4.1", + "typescript": "~5.8.2", + "vite": "^6.2.0" + } + }, + "node_modules/@alloc/quick-lru": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz", + "integrity": "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@babel/code-frame": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.28.6.tgz", + "integrity": "sha512-JYgintcMjRiCvS8mMECzaEn+m3PfoQiyqukOMCCVQtoJGYJw8j/8LBJEiqkHLkfwCcs74E3pbAUFNg7d9VNJ+Q==", + "dev": true, + "license": "MIT", + 
"dependencies": { + "@babel/helper-validator-identifier": "^7.28.5", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/compat-data": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.28.6.tgz", + "integrity": "sha512-2lfu57JtzctfIrcGMz992hyLlByuzgIk58+hhGCxjKZ3rWI82NnVLjXcaTqkI2NvlcvOskZaiZ5kjUALo3Lpxg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/core": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.28.6.tgz", + "integrity": "sha512-H3mcG6ZDLTlYfaSNi0iOKkigqMFvkTKlGUYlD8GW7nNOYRrevuA46iTypPyv+06V3fEmvvazfntkBU34L0azAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.28.6", + "@babel/generator": "^7.28.6", + "@babel/helper-compilation-targets": "^7.28.6", + "@babel/helper-module-transforms": "^7.28.6", + "@babel/helpers": "^7.28.6", + "@babel/parser": "^7.28.6", + "@babel/template": "^7.28.6", + "@babel/traverse": "^7.28.6", + "@babel/types": "^7.28.6", + "@jridgewell/remapping": "^2.3.5", + "convert-source-map": "^2.0.0", + "debug": "^4.1.0", + "gensync": "^1.0.0-beta.2", + "json5": "^2.2.3", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/babel" + } + }, + "node_modules/@babel/generator": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.28.6.tgz", + "integrity": "sha512-lOoVRwADj8hjf7al89tvQ2a1lf53Z+7tiXMgpZJL3maQPDxh0DgLMN62B2MKUOFcoodBHLMbDM6WAbKgNy5Suw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.28.6", + "@babel/types": "^7.28.6", + "@jridgewell/gen-mapping": "^0.3.12", + "@jridgewell/trace-mapping": "^0.3.28", + "jsesc": "^3.0.2" + }, + "engines": { + "node": ">=6.9.0" + } + }, + 
"node_modules/@babel/helper-compilation-targets": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.28.6.tgz", + "integrity": "sha512-JYtls3hqi15fcx5GaSNL7SCTJ2MNmjrkHXg4FSpOA/grxK8KwyZ5bubHsCq8FXCkua6xhuaaBit+3b7+VZRfcA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/compat-data": "^7.28.6", + "@babel/helper-validator-option": "^7.27.1", + "browserslist": "^4.24.0", + "lru-cache": "^5.1.1", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-globals": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@babel/helper-globals/-/helper-globals-7.28.0.tgz", + "integrity": "sha512-+W6cISkXFa1jXsDEdYA8HeevQT/FULhxzR99pxphltZcVaugps53THCeiWA8SguxxpSp3gKPiuYfSWopkLQ4hw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-imports": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.28.6.tgz", + "integrity": "sha512-l5XkZK7r7wa9LucGw9LwZyyCUscb4x37JWTPz7swwFE/0FMQAGpiWUZn8u9DzkSBWEcK25jmvubfpw2dnAMdbw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/traverse": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-transforms": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.28.6.tgz", + "integrity": "sha512-67oXFAYr2cDLDVGLXTEABjdBJZ6drElUSI7WKp70NrpyISso3plG9SAGEF6y7zbha/wOzUByWWTJvEDVNIUGcA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-module-imports": "^7.28.6", + "@babel/helper-validator-identifier": "^7.28.5", + "@babel/traverse": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + 
"node_modules/@babel/helper-plugin-utils": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.28.6.tgz", + "integrity": "sha512-S9gzZ/bz83GRysI7gAD4wPT/AI3uCnY+9xn+Mx/KPs2JwHJIz1W8PZkg2cqyt3RNOBM8ejcXhV6y8Og7ly/Dug==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-option": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.27.1.tgz", + "integrity": "sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helpers": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.28.6.tgz", + "integrity": "sha512-xOBvwq86HHdB7WUDTfKfT/Vuxh7gElQ+Sfti2Cy6yIWNW05P8iUslOVcZ4/sKbE+/jQaukQAdz/gf3724kYdqw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/template": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.28.6", + "resolved": 
"https://registry.npmjs.org/@babel/parser/-/parser-7.28.6.tgz", + "integrity": "sha512-TeR9zWR18BvbfPmGbLampPMW+uW1NZnJlRuuHso8i87QZNq2JRF9i6RgxRqtEq+wQGsS19NNTWr2duhnE49mfQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.6" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-self": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-self/-/plugin-transform-react-jsx-self-7.27.1.tgz", + "integrity": "sha512-6UzkCs+ejGdZ5mFFC/OCUrv028ab2fp1znZmCZjAOBKiBK2jXD1O+BPSfX8X2qjJ75fZBMSnQn3Rq2mrBJK2mw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-source": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-source/-/plugin-transform-react-jsx-source-7.27.1.tgz", + "integrity": "sha512-zbwoTsBruTeKB9hSq73ha66iFeJHuaFkUbwvqElnygoNbj/jHRsSeokowZFN3CZ64IvEqcmmkVe89OPXc7ldAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/template": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz", + "integrity": "sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.28.6", + "@babel/parser": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/traverse": { + "version": "7.28.6", + "resolved": 
"https://registry.npmjs.org/@babel/traverse/-/traverse-7.28.6.tgz", + "integrity": "sha512-fgWX62k02qtjqdSNTAGxmKYY/7FSL9WAS1o2Hu5+I5m9T0yxZzr4cnrfXQ/MX0rIifthCSs6FKTlzYbJcPtMNg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.28.6", + "@babel/generator": "^7.28.6", + "@babel/helper-globals": "^7.28.0", + "@babel/parser": "^7.28.6", + "@babel/template": "^7.28.6", + "@babel/types": "^7.28.6", + "debug": "^4.3.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/types": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.28.6.tgz", + "integrity": "sha512-0ZrskXVEHSWIqZM/sQZ4EV3jZJXRkio/WCxaqKZP1g//CEWEPSfeZFcms4XeKBCHU0ZKnIkdJeU/kF+eRp5lBg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.12.tgz", + "integrity": "sha512-Hhmwd6CInZ3dwpuGTF8fJG6yoWmsToE+vYgD4nytZVxcu1ulHpUQRAB1UJ8+N1Am3Mz4+xOByoQoSZf4D+CpkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.12.tgz", + "integrity": "sha512-VJ+sKvNA/GE7Ccacc9Cha7bpS8nyzVv0jdVgwNDaR4gDMC/2TTRc33Ip8qrNYUcpkOHUT5OZ0bUcNNVZQ9RLlg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.12.tgz", + "integrity": 
"sha512-6AAmLG7zwD1Z159jCKPvAxZd4y/VTO0VkprYy+3N2FtJ8+BQWFXU+OxARIwA46c5tdD9SsKGZ/1ocqBS/gAKHg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.12.tgz", + "integrity": "sha512-5jbb+2hhDHx5phYR2By8GTWEzn6I9UqR11Kwf22iKbNpYrsmRB18aX/9ivc5cabcUiAT/wM+YIZ6SG9QO6a8kg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.12.tgz", + "integrity": "sha512-N3zl+lxHCifgIlcMUP5016ESkeQjLj/959RxxNYIthIg+CQHInujFuXeWbWMgnTo4cp5XVHqFPmpyu9J65C1Yg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.12.tgz", + "integrity": "sha512-HQ9ka4Kx21qHXwtlTUVbKJOAnmG1ipXhdWTmNXiPzPfWKpXqASVcWdnf2bnL73wgjNrFXAa3yYvBSd9pzfEIpA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.12.tgz", + "integrity": "sha512-gA0Bx759+7Jve03K1S0vkOu5Lg/85dou3EseOGUes8flVOGxbhDDh/iZaoek11Y8mtyKPGF3vP8XhnkDEAmzeg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.12", + 
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.12.tgz", + "integrity": "sha512-TGbO26Yw2xsHzxtbVFGEXBFH0FRAP7gtcPE7P5yP7wGy7cXK2oO7RyOhL5NLiqTlBh47XhmIUXuGciXEqYFfBQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.12.tgz", + "integrity": "sha512-lPDGyC1JPDou8kGcywY0YILzWlhhnRjdof3UlcoqYmS9El818LLfJJc3PXXgZHrHCAKs/Z2SeZtDJr5MrkxtOw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.12.tgz", + "integrity": "sha512-8bwX7a8FghIgrupcxb4aUmYDLp8pX06rGh5HqDT7bB+8Rdells6mHvrFHHW2JAOPZUbnjUpKTLg6ECyzvas2AQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.12.tgz", + "integrity": "sha512-0y9KrdVnbMM2/vG8KfU0byhUN+EFCny9+8g202gYqSSVMonbsCfLjUO+rCci7pM0WBEtz+oK/PIwHkzxkyharA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.12.tgz", + "integrity": "sha512-h///Lr5a9rib/v1GGqXVGzjL4TMvVTv+s1DPoxQdz7l/AYv6LDSxdIwzxkrPW438oUXiDtwM10o9PmwS/6Z0Ng==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": 
">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.12.tgz", + "integrity": "sha512-iyRrM1Pzy9GFMDLsXn1iHUm18nhKnNMWscjmp4+hpafcZjrr2WbT//d20xaGljXDBYHqRcl8HnxbX6uaA/eGVw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.12.tgz", + "integrity": "sha512-9meM/lRXxMi5PSUqEXRCtVjEZBGwB7P/D4yT8UG/mwIdze2aV4Vo6U5gD3+RsoHXKkHCfSxZKzmDssVlRj1QQA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.12.tgz", + "integrity": "sha512-Zr7KR4hgKUpWAwb1f3o5ygT04MzqVrGEGXGLnj15YQDJErYu/BGg+wmFlIDOdJp0PmB0lLvxFIOXZgFRrdjR0w==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.12.tgz", + "integrity": "sha512-MsKncOcgTNvdtiISc/jZs/Zf8d0cl/t3gYWX8J9ubBnVOwlk65UIEEvgBORTiljloIWnBzLs4qhzPkJcitIzIg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.12.tgz", + "integrity": "sha512-uqZMTLr/zR/ed4jIGnwSLkaHmPjOjJvnm6TVVitAa08SLS9Z0VM8wIRx7gWbJB5/J54YuIMInDquWyYvQLZkgw==", + "cpu": [ + "x64" + ], + 
"dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.12.tgz", + "integrity": "sha512-xXwcTq4GhRM7J9A8Gv5boanHhRa/Q9KLVmcyXHCTaM4wKfIpWkdXiMog/KsnxzJ0A1+nD+zoecuzqPmCRyBGjg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.12.tgz", + "integrity": "sha512-Ld5pTlzPy3YwGec4OuHh1aCVCRvOXdH8DgRjfDy/oumVovmuSzWfnSJg+VtakB9Cm0gxNO9BzWkj6mtO1FMXkQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.12.tgz", + "integrity": "sha512-fF96T6KsBo/pkQI950FARU9apGNTSlZGsv1jZBAlcLL1MLjLNIWPBkj5NlSz8aAzYKg+eNqknrUJ24QBybeR5A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.12.tgz", + "integrity": "sha512-MZyXUkZHjQxUvzK7rN8DJ3SRmrVrke8ZyRusHlP+kuwqTcfWLyqMOE3sScPPyeIXN/mDJIfGXvcMqCgYKekoQw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.25.12.tgz", + "integrity": 
"sha512-rm0YWsqUSRrjncSXGA7Zv78Nbnw4XL6/dzr20cyrQf7ZmRcsovpcRBdhD43Nuk3y7XIoW2OxMVvwuRvk9XdASg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.12.tgz", + "integrity": "sha512-3wGSCDyuTHQUzt0nV7bocDy72r2lI33QL3gkDNGkod22EsYl04sMf0qLb8luNKTOmgF/eDEDP5BFNwoBKH441w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.12.tgz", + "integrity": "sha512-rMmLrur64A7+DKlnSuwqUdRKyd3UE7oPJZmnljqEptesKM8wx9J8gx5u0+9Pq0fQQW8vqeKebwNXdfOyP+8Bsg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.12.tgz", + "integrity": "sha512-HkqnmmBoCbCwxUKKNPBixiWDGCpQGVsrQfJoVGYLPT41XWF8lHuE5N6WhVia2n4o5QK5M4tYr21827fNhi4byQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.12.tgz", + "integrity": "sha512-alJC0uCZpTFrSL0CCDjcgleBXPnCrEAhTBILpeAp7M/OFgoqtAetfBzX0xM00MUsVVPpVjlPuMbREqnZCXaTnA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.13", + "resolved": 
"https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", + "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.0", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/remapping": { + "version": "2.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz", + "integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "dev": true, + "license": "MIT" + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.31", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", + "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", 
+ "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@reduxjs/toolkit": { + "version": "2.11.2", + "resolved": "https://registry.npmjs.org/@reduxjs/toolkit/-/toolkit-2.11.2.tgz", + "integrity": "sha512-Kd6kAHTA6/nUpp8mySPqj3en3dm0tdMIgbttnQ1xFMVpufoj+ADi8pXLBsd4xzTRHQa7t/Jv8W5UnCuW4kuWMQ==", + "license": "MIT", + "dependencies": { + "@standard-schema/spec": "^1.0.0", + "@standard-schema/utils": "^0.3.0", + "immer": "^11.0.0", + "redux": "^5.0.1", + "redux-thunk": "^3.1.0", + "reselect": "^5.1.0" + }, + "peerDependencies": { + "react": "^16.9.0 || ^17.0.0 || ^18 || ^19", + "react-redux": "^7.2.1 || ^8.1.3 || ^9.0.0" + }, + "peerDependenciesMeta": { + "react": { + "optional": true + }, + "react-redux": { + "optional": true + } + } + }, + "node_modules/@reduxjs/toolkit/node_modules/immer": { + "version": "11.1.3", + "resolved": "https://registry.npmjs.org/immer/-/immer-11.1.3.tgz", + "integrity": "sha512-6jQTc5z0KJFtr1UgFpIL3N9XSC3saRaI9PwWtzM2pSqkNGtiNkYY2OSwkOGDK2XcTRcLb1pi/aNkKZz0nxVH4Q==", + "license": 
"MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/immer" + } + }, + "node_modules/@remix-run/router": { + "version": "1.23.2", + "resolved": "https://registry.npmjs.org/@remix-run/router/-/router-1.23.2.tgz", + "integrity": "sha512-Ic6m2U/rMjTkhERIa/0ZtXJP17QUi2CbWE7cqx4J58M8aA3QTfW+2UlQ4psvTX9IO1RfNVhK3pcpdjej7L+t2w==", + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.53", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.53.tgz", + "integrity": "sha512-vENRlFU4YbrwVqNDZ7fLvy+JR1CRkyr01jhSiDpE1u6py3OMzQfztQU2jxykW3ALNxO4kSlqIDeYyD0Y9RcQeQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.56.0.tgz", + "integrity": "sha512-LNKIPA5k8PF1+jAFomGe3qN3bbIgJe/IlpDBwuVjrDKrJhVWywgnJvflMt/zkbVNLFtF1+94SljYQS6e99klnw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.56.0.tgz", + "integrity": "sha512-lfbVUbelYqXlYiU/HApNMJzT1E87UPGvzveGg2h0ktUNlOCxKlWuJ9jtfvs1sKHdwU4fzY7Pl8sAl49/XaEk6Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.56.0.tgz", + "integrity": "sha512-EgxD1ocWfhoD6xSOeEEwyE7tDvwTgZc8Bss7wCWe+uc7wO8G34HHCUH+Q6cHqJubxIAnQzAsyUsClt0yFLu06w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + 
"node_modules/@rollup/rollup-darwin-x64": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.56.0.tgz", + "integrity": "sha512-1vXe1vcMOssb/hOF8iv52A7feWW2xnu+c8BV4t1F//m9QVLTfNVpEdja5ia762j/UEJe2Z1jAmEqZAK42tVW3g==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.56.0.tgz", + "integrity": "sha512-bof7fbIlvqsyv/DtaXSck4VYQ9lPtoWNFCB/JY4snlFuJREXfZnm+Ej6yaCHfQvofJDXLDMTVxWscVSuQvVWUQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.56.0.tgz", + "integrity": "sha512-KNa6lYHloW+7lTEkYGa37fpvPq+NKG/EHKM8+G/g9WDU7ls4sMqbVRV78J6LdNuVaeeK5WB9/9VAFbKxcbXKYg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.56.0.tgz", + "integrity": "sha512-E8jKK87uOvLrrLN28jnAAAChNq5LeCd2mGgZF+fGF5D507WlG/Noct3lP/QzQ6MrqJ5BCKNwI9ipADB6jyiq2A==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.56.0.tgz", + "integrity": "sha512-jQosa5FMYF5Z6prEpTCCmzCXz6eKr/tCBssSmQGEeozA9tkRUty/5Vx06ibaOP9RCrW1Pvb8yp3gvZhHwTDsJw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": 
true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.56.0.tgz", + "integrity": "sha512-uQVoKkrC1KGEV6udrdVahASIsaF8h7iLG0U0W+Xn14ucFwi6uS539PsAr24IEF9/FoDtzMeeJXJIBo5RkbNWvQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.56.0.tgz", + "integrity": "sha512-vLZ1yJKLxhQLFKTs42RwTwa6zkGln+bnXc8ueFGMYmBTLfNu58sl5/eXyxRa2RarTkJbXl8TKPgfS6V5ijNqEA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.56.0.tgz", + "integrity": "sha512-FWfHOCub564kSE3xJQLLIC/hbKqHSVxy8vY75/YHHzWvbJL7aYJkdgwD/xGfUlL5UV2SB7otapLrcCj2xnF1dg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-musl": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.56.0.tgz", + "integrity": "sha512-z1EkujxIh7nbrKL1lmIpqFTc/sr0u8Uk0zK/qIEFldbt6EDKWFk/pxFq3gYj4Bjn3aa9eEhYRlL3H8ZbPT1xvA==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.56.0.tgz", + "integrity": "sha512-iNFTluqgdoQC7AIE8Q34R3AuPrJGJirj5wMUErxj22deOcY7XwZRaqYmB6ZKFHoVGqRcRd0mqO+845jAibKCkw==", + 
"cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-musl": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.56.0.tgz", + "integrity": "sha512-MtMeFVlD2LIKjp2sE2xM2slq3Zxf9zwVuw0jemsxvh1QOpHSsSzfNOTH9uYW9i1MXFxUSMmLpeVeUzoNOKBaWg==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.56.0.tgz", + "integrity": "sha512-in+v6wiHdzzVhYKXIk5U74dEZHdKN9KH0Q4ANHOTvyXPG41bajYRsy7a8TPKbYPl34hU7PP7hMVHRvv/5aCSew==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.56.0.tgz", + "integrity": "sha512-yni2raKHB8m9NQpI9fPVwN754mn6dHQSbDTwxdr9SE0ks38DTjLMMBjrwvB5+mXrX+C0npX0CVeCUcvvvD8CNQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.56.0.tgz", + "integrity": "sha512-zhLLJx9nQPu7wezbxt2ut+CI4YlXi68ndEve16tPc/iwoylWS9B3FxpLS2PkmfYgDQtosah07Mj9E0khc3Y+vQ==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.56.0.tgz", + "integrity": 
"sha512-MVC6UDp16ZSH7x4rtuJPAEoE1RwS8N4oK9DLHy3FTEdFoUTCFVzMfJl/BVJ330C+hx8FfprA5Wqx4FhZXkj2Kw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.56.0.tgz", + "integrity": "sha512-ZhGH1eA4Qv0lxaV00azCIS1ChedK0V32952Md3FtnxSqZTBTd6tgil4nZT5cU8B+SIw3PFYkvyR4FKo2oyZIHA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openbsd-x64": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.56.0.tgz", + "integrity": "sha512-O16XcmyDeFI9879pEcmtWvD/2nyxR9mF7Gs44lf1vGGx8Vg2DRNx11aVXBEqOQhWb92WN4z7fW/q4+2NYzCbBA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.56.0.tgz", + "integrity": "sha512-LhN/Reh+7F3RCgQIRbgw8ZMwUwyqJM+8pXNT6IIJAqm2IdKkzpCh/V9EdgOMBKuebIrzswqy4ATlrDgiOwbRcQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.56.0.tgz", + "integrity": "sha512-kbFsOObXp3LBULg1d3JIUQMa9Kv4UitDmpS+k0tinPBz3watcUiV2/LUDMMucA6pZO3WGE27P7DsfaN54l9ing==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.56.0", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.56.0.tgz", + "integrity": "sha512-vSSgny54D6P4vf2izbtFm/TcWYedw7f8eBrOiGGecyHyQB9q4Kqentjaj8hToe+995nob/Wv48pDqL5a62EWtg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.56.0.tgz", + "integrity": "sha512-FeCnkPCTHQJFbiGG49KjV5YGW/8b9rrXAM2Mz2kiIoktq2qsJxRD5giEMEOD2lPdgs72upzefaUvS+nc8E3UzQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.56.0.tgz", + "integrity": "sha512-H8AE9Ur/t0+1VXujj90w0HrSOuv0Nq9r1vSZF2t5km20NTfosQsGGUXDaKdQZzwuLts7IyL1fYT4hM95TI9c4g==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@standard-schema/spec": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz", + "integrity": "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==", + "license": "MIT" + }, + "node_modules/@standard-schema/utils": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/@standard-schema/utils/-/utils-0.3.0.tgz", + "integrity": "sha512-e7Mew686owMaPJVNNLs55PUvgz371nKgwsc4vxE49zsODpJEnxgxRo2y/OKrqueavXgZNMDVj3DdHFlaSAeU8g==", + "license": "MIT" + }, + "node_modules/@tanstack/query-core": { + "version": "5.90.20", + "resolved": "https://registry.npmjs.org/@tanstack/query-core/-/query-core-5.90.20.tgz", + "integrity": "sha512-OMD2HLpNouXEfZJWcKeVKUgQ5n+n3A2JFmBaScpNDUqSrQSjiveC7dKMe53uJUg1nDG16ttFPz2xfilz6i2uVg==", + "license": "MIT", + "funding": 
{ + "type": "github", + "url": "https://github.com/sponsors/tannerlinsley" + } + }, + "node_modules/@tanstack/react-query": { + "version": "5.90.20", + "resolved": "https://registry.npmjs.org/@tanstack/react-query/-/react-query-5.90.20.tgz", + "integrity": "sha512-vXBxa+qeyveVO7OA0jX1z+DeyCA4JKnThKv411jd5SORpBKgkcVnYKCiBgECvADvniBX7tobwBmg01qq9JmMJw==", + "license": "MIT", + "dependencies": { + "@tanstack/query-core": "5.90.20" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/tannerlinsley" + }, + "peerDependencies": { + "react": "^18 || ^19" + } + }, + "node_modules/@types/babel__core": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", + "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.20.7", + "@babel/types": "^7.20.7", + "@types/babel__generator": "*", + "@types/babel__template": "*", + "@types/babel__traverse": "*" + } + }, + "node_modules/@types/babel__generator": { + "version": "7.27.0", + "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz", + "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__template": { + "version": "7.4.4", + "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", + "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__traverse": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", 
+ "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.2" + } + }, + "node_modules/@types/d3-array": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/@types/d3-array/-/d3-array-3.2.2.tgz", + "integrity": "sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw==", + "license": "MIT" + }, + "node_modules/@types/d3-color": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", + "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", + "license": "MIT" + }, + "node_modules/@types/d3-ease": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-ease/-/d3-ease-3.0.2.tgz", + "integrity": "sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA==", + "license": "MIT" + }, + "node_modules/@types/d3-interpolate": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", + "integrity": "sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", + "license": "MIT", + "dependencies": { + "@types/d3-color": "*" + } + }, + "node_modules/@types/d3-path": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/@types/d3-path/-/d3-path-3.1.1.tgz", + "integrity": "sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg==", + "license": "MIT" + }, + "node_modules/@types/d3-scale": { + "version": "4.0.9", + "resolved": "https://registry.npmjs.org/@types/d3-scale/-/d3-scale-4.0.9.tgz", + "integrity": "sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw==", + "license": "MIT", + "dependencies": { + "@types/d3-time": "*" + } + }, + 
"node_modules/@types/d3-shape": { + "version": "3.1.8", + "resolved": "https://registry.npmjs.org/@types/d3-shape/-/d3-shape-3.1.8.tgz", + "integrity": "sha512-lae0iWfcDeR7qt7rA88BNiqdvPS5pFVPpo5OfjElwNaT2yyekbM0C9vK+yqBqEmHr6lDkRnYNoTBYlAgJa7a4w==", + "license": "MIT", + "dependencies": { + "@types/d3-path": "*" + } + }, + "node_modules/@types/d3-time": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-time/-/d3-time-3.0.4.tgz", + "integrity": "sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g==", + "license": "MIT" + }, + "node_modules/@types/d3-timer": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-timer/-/d3-timer-3.0.2.tgz", + "integrity": "sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw==", + "license": "MIT" + }, + "node_modules/@types/estree": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "22.19.7", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.7.tgz", + "integrity": "sha512-MciR4AKGHWl7xwxkBa6xUGxQJ4VBOmPTF7sL+iGzuahOFaO0jHCsuEfS80pan1ef4gWId1oWOweIhrDEYLuaOw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/@types/use-sync-external-store": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/@types/use-sync-external-store/-/use-sync-external-store-0.0.6.tgz", + "integrity": "sha512-zFDAD+tlpf2r4asuHEj0XH6pY6i0g5NeAHPn+15wk3BV6JA69eERFXC1gyGThDkVa1zCyKr5jox1+2LbV/AMLg==", + "license": "MIT" + }, + "node_modules/@vitejs/plugin-react": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-5.1.2.tgz", + "integrity": 
"sha512-EcA07pHJouywpzsoTUqNh5NwGayl2PPVEJKUSinGGSxFGYn+shYbqMGBg6FXDqgXum9Ou/ecb+411ssw8HImJQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/core": "^7.28.5", + "@babel/plugin-transform-react-jsx-self": "^7.27.1", + "@babel/plugin-transform-react-jsx-source": "^7.27.1", + "@rolldown/pluginutils": "1.0.0-beta.53", + "@types/babel__core": "^7.20.5", + "react-refresh": "^0.18.0" + }, + "engines": { + "node": "^20.19.0 || >=22.12.0" + }, + "peerDependencies": { + "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0" + } + }, + "node_modules/any-promise": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", + "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==", + "dev": true, + "license": "MIT" + }, + "node_modules/anymatch": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", + "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", + "dev": true, + "license": "ISC", + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/arg": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz", + "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==", + "dev": true, + "license": "MIT" + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, + "node_modules/autoprefixer": { + "version": "10.4.23", + "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.23.tgz", + "integrity": 
"sha512-YYTXSFulfwytnjAPlw8QHncHJmlvFKtczb8InXaAx9Q0LbfDnfEYDE55omerIJKihhmU61Ft+cAOSzQVaBUmeA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/autoprefixer" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "browserslist": "^4.28.1", + "caniuse-lite": "^1.0.30001760", + "fraction.js": "^5.3.4", + "picocolors": "^1.1.1", + "postcss-value-parser": "^4.2.0" + }, + "bin": { + "autoprefixer": "bin/autoprefixer" + }, + "engines": { + "node": "^10 || ^12 || >=14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/axios": { + "version": "1.13.3", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.13.3.tgz", + "integrity": "sha512-ERT8kdX7DZjtUm7IitEyV7InTHAF42iJuMArIiDIV5YtPanJkgw4hw5Dyg9fh0mihdWNn1GKaeIWErfe56UQ1g==", + "license": "MIT", + "dependencies": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.4", + "proxy-from-env": "^1.1.0" + } + }, + "node_modules/baseline-browser-mapping": { + "version": "2.9.18", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.18.tgz", + "integrity": "sha512-e23vBV1ZLfjb9apvfPk4rHVu2ry6RIr2Wfs+O324okSidrX7pTAnEJPCh/O5BtRlr7QtZI7ktOP3vsqr7Z5XoA==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "baseline-browser-mapping": "dist/cli.js" + } + }, + "node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/braces": { + "version": "3.0.3", + "resolved": 
"https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "dev": true, + "license": "MIT", + "dependencies": { + "fill-range": "^7.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/browserslist": { + "version": "4.28.1", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.1.tgz", + "integrity": "sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "baseline-browser-mapping": "^2.9.0", + "caniuse-lite": "^1.0.30001759", + "electron-to-chromium": "^1.5.263", + "node-releases": "^2.0.27", + "update-browserslist-db": "^1.2.0" + }, + "bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/camelcase-css": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/camelcase-css/-/camelcase-css-2.0.1.tgz", + "integrity": "sha512-QOSvevhslijgYwRx6Rv7zKdMF8lbRmx+uQGx2+vDc+KI/eBnsy9kit5aj23AgGu3pa4t9AgwbnXWqS+iOY+2aA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/caniuse-lite": { + "version": 
"1.0.30001766", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001766.tgz", + "integrity": "sha512-4C0lfJ0/YPjJQHagaE9x2Elb69CIqEPZeG0anQt9SIvIoOH4a4uaRl73IavyO+0qZh6MDLH//DrXThEYKHkmYA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "CC-BY-4.0" + }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "dev": true, + "license": "MIT", + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/chokidar/node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/clsx": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz", + "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": 
"https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/commander": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", + "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/convert-source-map": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", + "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", + "dev": true, + "license": "MIT" + }, + "node_modules/cssesc": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", + "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", + "dev": true, + "license": "MIT", + "bin": { + "cssesc": "bin/cssesc" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/d3-array": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-3.2.4.tgz", + "integrity": "sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==", + "license": "ISC", + "dependencies": { + "internmap": "1 - 2" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-color": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-color/-/d3-color-3.1.0.tgz", + "integrity": "sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-ease": { + "version": "3.0.1", + "resolved": 
"https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", + "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-format": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/d3-format/-/d3-format-3.1.2.tgz", + "integrity": "sha512-AJDdYOdnyRDV5b6ArilzCPPwc1ejkHcoyFarqlPqT7zRYjhavcT3uSrqcMvsgh2CgoPbK3RCwyHaVyxYcP2Arg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-interpolate": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-interpolate/-/d3-interpolate-3.0.1.tgz", + "integrity": "sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-path": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-path/-/d3-path-3.1.0.tgz", + "integrity": "sha512-p3KP5HCf/bvjBSSKuXid6Zqijx7wIfNW+J/maPs+iwR35at5JCbLUT0LzF1cnjbCHWhqzQTIN2Jpe8pRebIEFQ==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-scale": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/d3-scale/-/d3-scale-4.0.2.tgz", + "integrity": "sha512-GZW464g1SH7ag3Y7hXjf8RoUuAFIqklOAq3MRl4OaWabTFJY9PN/E1YklhXLh+OQ3fM9yS2nOkCoS+WLZ6kvxQ==", + "license": "ISC", + "dependencies": { + "d3-array": "2.10.0 - 3", + "d3-format": "1 - 3", + "d3-interpolate": "1.2.0 - 3", + "d3-time": "2.1.1 - 3", + "d3-time-format": "2 - 4" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-shape": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-3.2.0.tgz", + "integrity": "sha512-SaLBuwGm3MOViRq2ABk3eLoxwZELpH6zhl3FbAoJ7Vm1gofKx6El1Ib5z23NUEhF9AsGl7y+dzLe5Cw2AArGTA==", + "license": "ISC", + "dependencies": { + "d3-path": "^3.1.0" + }, + "engines": { + 
"node": ">=12" + } + }, + "node_modules/d3-time": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-time/-/d3-time-3.1.0.tgz", + "integrity": "sha512-VqKjzBLejbSMT4IgbmVgDjpkYrNWUYJnbCGo874u7MMKIWsILRX+OpX/gTk8MqjpT1A/c6HY2dCA77ZN0lkQ2Q==", + "license": "ISC", + "dependencies": { + "d3-array": "2 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-time-format": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/d3-time-format/-/d3-time-format-4.1.0.tgz", + "integrity": "sha512-dJxPBlzC7NugB2PDLwo9Q8JiTR3M3e4/XANkreKSUxF8vvXKqm1Yfq4Q5dl8budlunRVlUUaDUgFt7eA8D6NLg==", + "license": "ISC", + "dependencies": { + "d3-time": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-timer": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-3.0.1.tgz", + "integrity": "sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/date-fns": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-3.6.0.tgz", + "integrity": "sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/kossnocorp" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/decimal.js-light": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/decimal.js-light/-/decimal.js-light-2.5.1.tgz", + "integrity": 
"sha512-qIMFpTMZmny+MMIitAB6D7iVPEorVw6YQRWkvarTkT4tBeSLLiHzcwj6q0MmYSFCiVpiqPJTJEYIrpcPzVEIvg==", + "license": "MIT" + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/didyoumean": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/didyoumean/-/didyoumean-1.2.2.tgz", + "integrity": "sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==", + "dev": true, + "license": "Apache-2.0" + }, + "node_modules/dlv": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz", + "integrity": "sha512-+HlytyjlPKnIG8XuRG8WvmBP8xs8P71y+SKKS6ZXWoEgLuePxtDoUEiH7WkdePWrQ5JBpE6aoVqfZfJUQkjXwA==", + "dev": true, + "license": "MIT" + }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/electron-to-chromium": { + "version": "1.5.278", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.278.tgz", + "integrity": "sha512-dQ0tM1svDRQOwxnXxm+twlGTjr9Upvt8UFWAgmLsxEzFQxhbti4VwxmMjsDxVC51Zo84swW7FVCXEV+VAkhuPw==", + "dev": true, + "license": "ISC" + }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": 
"sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-toolkit": { + "version": "1.44.0", + "resolved": "https://registry.npmjs.org/es-toolkit/-/es-toolkit-1.44.0.tgz", + "integrity": "sha512-6penXeZalaV88MM3cGkFZZfOoLGWshWWfdy0tWw/RlVVyhvMaWSBTOvXNeiW3e5FwdS5ePW0LGEu17zT139ktg==", + "license": "MIT", + "workspaces": [ + "docs", + "benchmarks" + ] + }, + "node_modules/esbuild": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.12.tgz", + "integrity": "sha512-bbPBYYrtZbkt6Os6FiTLCTFxvq4tt3JKall1vRwshA3fdVztsLAatFaZobhkBC8/BrPetoa0oksYoKXoG4ryJg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" 
+ }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.12", + "@esbuild/android-arm": "0.25.12", + "@esbuild/android-arm64": "0.25.12", + "@esbuild/android-x64": "0.25.12", + "@esbuild/darwin-arm64": "0.25.12", + "@esbuild/darwin-x64": "0.25.12", + "@esbuild/freebsd-arm64": "0.25.12", + "@esbuild/freebsd-x64": "0.25.12", + "@esbuild/linux-arm": "0.25.12", + "@esbuild/linux-arm64": "0.25.12", + "@esbuild/linux-ia32": "0.25.12", + "@esbuild/linux-loong64": "0.25.12", + "@esbuild/linux-mips64el": "0.25.12", + "@esbuild/linux-ppc64": "0.25.12", + "@esbuild/linux-riscv64": "0.25.12", + "@esbuild/linux-s390x": "0.25.12", + "@esbuild/linux-x64": "0.25.12", + "@esbuild/netbsd-arm64": "0.25.12", + "@esbuild/netbsd-x64": "0.25.12", + "@esbuild/openbsd-arm64": "0.25.12", + "@esbuild/openbsd-x64": "0.25.12", + "@esbuild/openharmony-arm64": "0.25.12", + "@esbuild/sunos-x64": "0.25.12", + "@esbuild/win32-arm64": "0.25.12", + "@esbuild/win32-ia32": "0.25.12", + "@esbuild/win32-x64": "0.25.12" + } + }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/eventemitter3": { + "version": "5.0.4", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.4.tgz", + "integrity": "sha512-mlsTRyGaPBjPedk6Bvw+aqbsXDtoAyAzm5MO7JgU+yVRyMQ5O8bD4Kcci7BS85f93veegeCPkL8R4GLClnjLFw==", + "license": "MIT" + }, + "node_modules/fast-glob": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz", + "integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + 
"merge2": "^1.3.0", + "micromatch": "^4.0.8" + }, + "engines": { + "node": ">=8.6.0" + } + }, + "node_modules/fast-glob/node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/fastq": { + "version": "1.20.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.20.1.tgz", + "integrity": "sha512-GGToxJ/w1x32s/D2EKND7kTil4n8OVk/9mycTc4VDza13lOvpUZTGX3mFSCtV9ksdGBVzvsyAVLM6mHFThxXxw==", + "dev": true, + "license": "ISC", + "dependencies": { + "reusify": "^1.0.4" + } + }, + "node_modules/fill-range": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "dev": true, + "license": "MIT", + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/follow-redirects": { + "version": "1.15.11", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz", + "integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==", + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "license": "MIT", + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, + "node_modules/form-data": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", + "integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + 
"combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/fraction.js": { + "version": "5.3.4", + "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-5.3.4.tgz", + "integrity": "sha512-1X1NTtiJphryn/uLQz3whtY6jK3fTqoE3ohKs0tT+Ujr1W59oopxmoEh7Lu5p6vBaPbgoM0bzveAW4Qi5RyWDQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": "*" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/rawify" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/gensync": { + "version": "1.0.0-beta.2", + "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", + "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + 
"es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/glob-parent": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", + "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", + "dev": true, + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.3" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + 
"integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "license": "MIT", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/immer": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/immer/-/immer-10.2.0.tgz", + "integrity": "sha512-d/+XTN3zfODyjr89gM3mPq1WNX2B8pYsu7eORitdwyA2sBubnTl3laYlBk4sXY5FUa5qTZGBDPJICVbvqzjlbw==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/immer" + } + }, + "node_modules/internmap": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/internmap/-/internmap-2.0.3.tgz", + "integrity": "sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "license": "MIT", + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-core-module": { + "version": "2.16.1", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", + "integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==", + "dev": true, + "license": "MIT", + "dependencies": { + 
"hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/jiti": { + "version": "1.21.7", + "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.7.tgz", + "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==", + "dev": true, + "license": "MIT", + "bin": { + "jiti": "bin/jiti.js" + } + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/jsesc": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", + "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==", + "dev": true, + "license": "MIT", 
+ "bin": { + "jsesc": "bin/jsesc" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/json5": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz", + "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==", + "dev": true, + "license": "MIT", + "bin": { + "json5": "lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/lilconfig": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", + "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antonk52" + } + }, + "node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "dev": true, + "license": "MIT" + }, + "node_modules/lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "license": "ISC", + "dependencies": { + "yallist": "^3.0.2" + } + }, + "node_modules/lucide-react": { + "version": "0.563.0", + "resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.563.0.tgz", + "integrity": "sha512-8dXPB2GI4dI8jV4MgUDGBeLdGk8ekfqVZ0BdLcrRzocGgG75ltNEmWS+gE7uokKF/0oSUuczNDT+g9hFJ23FkA==", + "license": "ISC", + "peerDependencies": { + "react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": 
"sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/micromatch": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", + "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", + "dev": true, + "license": "MIT", + "dependencies": { + "braces": "^3.0.3", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/mz": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz", + "integrity": 
"sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "any-promise": "^1.0.0", + "object-assign": "^4.0.1", + "thenify-all": "^1.0.0" + } + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/node-releases": { + "version": "2.0.27", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz", + "integrity": "sha512-nmh3lCkYZ3grZvqcCH+fjmQ7X+H0OeZgP40OierEaAptX4XofMh5kwNbWh7lBduUzCcV/8kZ+NDLCwm2iorIlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-hash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-3.0.0.tgz", + "integrity": "sha512-RSn9F68PjH9HqtltsSnqYC1XXoWe9Bju5+213R98cNGttag9q9yAOTzdbsqvIa7aNm5WffBZFpWYr2aWrklWAw==", + "dev": true, + "license": "MIT", + "engines": { + "node": 
">= 6" + } + }, + "node_modules/path-parse": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", + "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", + "dev": true, + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "dev": true, + "license": "ISC" + }, + "node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/pirates": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.7.tgz", + "integrity": "sha512-TfySrs/5nm8fQJDcBDuUng3VOUKsd7S+zqvbOTiGXHfxX4wK31ard+hoNuvkicM/2YFzlpDgABOevKSsB4G/FA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/postcss": { + "version": "8.5.6", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", + "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": 
"https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/postcss-import": { + "version": "15.1.0", + "resolved": "https://registry.npmjs.org/postcss-import/-/postcss-import-15.1.0.tgz", + "integrity": "sha512-hpr+J05B2FVYUAXHeK1YyI267J/dDDhMU6B6civm8hSY1jYJnBXxzKDKDswzJmtLHryrjhnDjqqp/49t8FALew==", + "dev": true, + "license": "MIT", + "dependencies": { + "postcss-value-parser": "^4.0.0", + "read-cache": "^1.0.0", + "resolve": "^1.1.7" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "postcss": "^8.0.0" + } + }, + "node_modules/postcss-js": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/postcss-js/-/postcss-js-4.1.0.tgz", + "integrity": "sha512-oIAOTqgIo7q2EOwbhb8UalYePMvYoIeRY2YKntdpFQXNosSu3vLrniGgmH9OKs/qAkfoj5oB3le/7mINW1LCfw==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "camelcase-css": "^2.0.1" + }, + "engines": { + "node": "^12 || ^14 || >= 16" + }, + "peerDependencies": { + "postcss": "^8.4.21" + } + }, + "node_modules/postcss-load-config": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-6.0.1.tgz", + "integrity": "sha512-oPtTM4oerL+UXmx+93ytZVN82RrlY/wPUV8IeDxFrzIjXOLF1pN+EmKPLbubvKHT2HC20xXsCAH2Z+CKV6Oz/g==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "lilconfig": "^3.1.1" + }, + "engines": { + "node": ">= 18" + }, + 
"peerDependencies": { + "jiti": ">=1.21.0", + "postcss": ">=8.0.9", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "jiti": { + "optional": true + }, + "postcss": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/postcss-nested": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz", + "integrity": "sha512-HQbt28KulC5AJzG+cZtj9kvKB93CFCdLvog1WFLf1D+xmMvPGlBstkpTEZfK5+AN9hfJocyBFCNiqyS48bpgzQ==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "postcss-selector-parser": "^6.1.1" + }, + "engines": { + "node": ">=12.0" + }, + "peerDependencies": { + "postcss": "^8.2.14" + } + }, + "node_modules/postcss-selector-parser": { + "version": "6.1.2", + "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-6.1.2.tgz", + "integrity": "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg==", + "dev": true, + "license": "MIT", + "dependencies": { + "cssesc": "^3.0.0", + "util-deprecate": "^1.0.2" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postcss-value-parser": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz", + "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", + "license": "MIT" + }, + "node_modules/queue-microtask": { + "version": 
"1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/react": { + "version": "19.2.3", + "resolved": "https://registry.npmjs.org/react/-/react-19.2.3.tgz", + "integrity": "sha512-Ku/hhYbVjOQnXDZFv2+RibmLFGwFdeeKHFcOTlrt7xplBnya5OGn/hIRDsqDiSUcfORsDC7MPxwork8jBwsIWA==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-dom": { + "version": "19.2.3", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.3.tgz", + "integrity": "sha512-yELu4WmLPw5Mr/lmeEpox5rw3RETacE++JgHqQzd2dg+YbJuat3jH4ingc+WPZhxaoFzdv9y33G+F7Nl5O0GBg==", + "license": "MIT", + "dependencies": { + "scheduler": "^0.27.0" + }, + "peerDependencies": { + "react": "^19.2.3" + } + }, + "node_modules/react-is": { + "version": "19.2.3", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-19.2.3.tgz", + "integrity": "sha512-qJNJfu81ByyabuG7hPFEbXqNcWSU3+eVus+KJs+0ncpGfMyYdvSmxiJxbWR65lYi1I+/0HBcliO029gc4F+PnA==", + "license": "MIT", + "peer": true + }, + "node_modules/react-redux": { + "version": "9.2.0", + "resolved": "https://registry.npmjs.org/react-redux/-/react-redux-9.2.0.tgz", + "integrity": "sha512-ROY9fvHhwOD9ySfrF0wmvu//bKCQ6AeZZq1nJNtbDC+kk5DuSuNX/n6YWYF/SYy7bSba4D4FSz8DJeKY/S/r+g==", + "license": "MIT", + "dependencies": { + "@types/use-sync-external-store": "^0.0.6", + "use-sync-external-store": "^1.4.0" + }, + "peerDependencies": { + "@types/react": "^18.2.25 || ^19", + "react": "^18.0 || ^19", + "redux": "^5.0.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + 
"redux": { + "optional": true + } + } + }, + "node_modules/react-refresh": { + "version": "0.18.0", + "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.18.0.tgz", + "integrity": "sha512-QgT5//D3jfjJb6Gsjxv0Slpj23ip+HtOpnNgnb2S5zU3CB26G/IDPGoy4RJB42wzFE46DRsstbW6tKHoKbhAxw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-router": { + "version": "6.30.3", + "resolved": "https://registry.npmjs.org/react-router/-/react-router-6.30.3.tgz", + "integrity": "sha512-XRnlbKMTmktBkjCLE8/XcZFlnHvr2Ltdr1eJX4idL55/9BbORzyZEaIkBFDhFGCEWBBItsVrDxwx3gnisMitdw==", + "license": "MIT", + "dependencies": { + "@remix-run/router": "1.23.2" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "react": ">=16.8" + } + }, + "node_modules/react-router-dom": { + "version": "6.30.3", + "resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-6.30.3.tgz", + "integrity": "sha512-pxPcv1AczD4vso7G4Z3TKcvlxK7g7TNt3/FNGMhfqyntocvYKj+GCatfigGDjbLozC4baguJ0ReCigoDJXb0ag==", + "license": "MIT", + "dependencies": { + "@remix-run/router": "1.23.2", + "react-router": "6.30.3" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "react": ">=16.8", + "react-dom": ">=16.8" + } + }, + "node_modules/read-cache": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/read-cache/-/read-cache-1.0.0.tgz", + "integrity": "sha512-Owdv/Ft7IjOgm/i0xvNDZ1LrRANRfew4b2prF3OWMQLxLfu3bS8FVhCsrSCMK4lR56Y9ya+AThoTpDCTxCmpRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "pify": "^2.3.0" + } + }, + "node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "dev": true, + "license": "MIT", + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + 
"node_modules/recharts": { + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/recharts/-/recharts-3.7.0.tgz", + "integrity": "sha512-l2VCsy3XXeraxIID9fx23eCb6iCBsxUQDnE8tWm6DFdszVAO7WVY/ChAD9wVit01y6B2PMupYiMmQwhgPHc9Ew==", + "license": "MIT", + "workspaces": [ + "www" + ], + "dependencies": { + "@reduxjs/toolkit": "1.x.x || 2.x.x", + "clsx": "^2.1.1", + "decimal.js-light": "^2.5.1", + "es-toolkit": "^1.39.3", + "eventemitter3": "^5.0.1", + "immer": "^10.1.1", + "react-redux": "8.x.x || 9.x.x", + "reselect": "5.1.1", + "tiny-invariant": "^1.3.3", + "use-sync-external-store": "^1.2.2", + "victory-vendor": "^37.0.2" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-is": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, + "node_modules/redux": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/redux/-/redux-5.0.1.tgz", + "integrity": "sha512-M9/ELqF6fy8FwmkpnF0S3YKOqMyoWJ4+CS5Efg2ct3oY9daQvd/Pc71FpGZsVsbl3Cpb+IIcjBDUnnyBdQbq4w==", + "license": "MIT" + }, + "node_modules/redux-thunk": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/redux-thunk/-/redux-thunk-3.1.0.tgz", + "integrity": "sha512-NW2r5T6ksUKXCabzhL9z+h206HQw/NJkcLm1GPImRQ8IzfXwRGqjVhKJGauHirT0DAuyy6hjdnMZaRoAcy0Klw==", + "license": "MIT", + "peerDependencies": { + "redux": "^5.0.0" + } + }, + "node_modules/reselect": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/reselect/-/reselect-5.1.1.tgz", + "integrity": "sha512-K/BG6eIky/SBpzfHZv/dd+9JBFiS4SWV7FIujVyJRux6e45+73RaUHXLmIR1f7WOMaQ0U1km6qwklRQxpJJY0w==", + "license": "MIT" + }, + "node_modules/resolve": { + "version": "1.22.11", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.11.tgz", + "integrity": "sha512-RfqAvLnMl313r7c9oclB1HhUEAezcpLjz95wFH4LVuhk9JF/r22qmVP9AMmOU4vMX7Q8pN8jwNg/CSpdFnMjTQ==", + "dev": true, + "license": "MIT", + 
"dependencies": { + "is-core-module": "^2.16.1", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/reusify": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz", + "integrity": "sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==", + "dev": true, + "license": "MIT", + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/rollup": { + "version": "4.56.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.56.0.tgz", + "integrity": "sha512-9FwVqlgUHzbXtDg9RCMgodF3Ua4Na6Gau+Sdt9vyCN4RhHfVKX2DCHy3BjMLTDd47ITDhYAnTwGulWTblJSDLg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.8" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.56.0", + "@rollup/rollup-android-arm64": "4.56.0", + "@rollup/rollup-darwin-arm64": "4.56.0", + "@rollup/rollup-darwin-x64": "4.56.0", + "@rollup/rollup-freebsd-arm64": "4.56.0", + "@rollup/rollup-freebsd-x64": "4.56.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.56.0", + "@rollup/rollup-linux-arm-musleabihf": "4.56.0", + "@rollup/rollup-linux-arm64-gnu": "4.56.0", + "@rollup/rollup-linux-arm64-musl": "4.56.0", + "@rollup/rollup-linux-loong64-gnu": "4.56.0", + "@rollup/rollup-linux-loong64-musl": "4.56.0", + "@rollup/rollup-linux-ppc64-gnu": "4.56.0", + "@rollup/rollup-linux-ppc64-musl": "4.56.0", + "@rollup/rollup-linux-riscv64-gnu": "4.56.0", + "@rollup/rollup-linux-riscv64-musl": "4.56.0", + "@rollup/rollup-linux-s390x-gnu": "4.56.0", + "@rollup/rollup-linux-x64-gnu": "4.56.0", + "@rollup/rollup-linux-x64-musl": "4.56.0", + "@rollup/rollup-openbsd-x64": "4.56.0", + 
"@rollup/rollup-openharmony-arm64": "4.56.0", + "@rollup/rollup-win32-arm64-msvc": "4.56.0", + "@rollup/rollup-win32-ia32-msvc": "4.56.0", + "@rollup/rollup-win32-x64-gnu": "4.56.0", + "@rollup/rollup-win32-x64-msvc": "4.56.0", + "fsevents": "~2.3.2" + } + }, + "node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, + "node_modules/scheduler": { + "version": "0.27.0", + "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.27.0.tgz", + "integrity": "sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q==", + "license": "MIT" + }, + "node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sucrase": { + "version": "3.35.1", + "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.1.tgz", + "integrity": 
"sha512-DhuTmvZWux4H1UOnWMB3sk0sbaCVOoQZjv8u1rDoTV0HTdGem9hkAZtl4JZy8P2z4Bg0nT+YMeOFyVr4zcG5Tw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.2", + "commander": "^4.0.0", + "lines-and-columns": "^1.1.6", + "mz": "^2.7.0", + "pirates": "^4.0.1", + "tinyglobby": "^0.2.11", + "ts-interface-checker": "^0.1.9" + }, + "bin": { + "sucrase": "bin/sucrase", + "sucrase-node": "bin/sucrase-node" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, + "node_modules/supports-preserve-symlinks-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", + "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/tailwindcss": { + "version": "3.4.19", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.19.tgz", + "integrity": "sha512-3ofp+LL8E+pK/JuPLPggVAIaEuhvIz4qNcf3nA1Xn2o/7fb7s/TYpHhwGDv1ZU3PkBluUVaF8PyCHcm48cKLWQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@alloc/quick-lru": "^5.2.0", + "arg": "^5.0.2", + "chokidar": "^3.6.0", + "didyoumean": "^1.2.2", + "dlv": "^1.1.3", + "fast-glob": "^3.3.2", + "glob-parent": "^6.0.2", + "is-glob": "^4.0.3", + "jiti": "^1.21.7", + "lilconfig": "^3.1.3", + "micromatch": "^4.0.8", + "normalize-path": "^3.0.0", + "object-hash": "^3.0.0", + "picocolors": "^1.1.1", + "postcss": "^8.4.47", + "postcss-import": "^15.1.0", + "postcss-js": "^4.0.1", + "postcss-load-config": "^4.0.2 || ^5.0 || ^6.0", + "postcss-nested": "^6.2.0", + "postcss-selector-parser": "^6.1.2", + "resolve": "^1.22.8", + "sucrase": "^3.35.0" + }, + "bin": { + "tailwind": "lib/cli.js", + "tailwindcss": "lib/cli.js" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/thenify": 
{ + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", + "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==", + "dev": true, + "license": "MIT", + "dependencies": { + "any-promise": "^1.0.0" + } + }, + "node_modules/thenify-all": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz", + "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==", + "dev": true, + "license": "MIT", + "dependencies": { + "thenify": ">= 3.1.0 < 4" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/tiny-invariant": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz", + "integrity": "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==", + "license": "MIT" + }, + "node_modules/tinyglobby": { + "version": "0.2.15", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz", + "integrity": "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "fdir": "^6.5.0", + "picomatch": "^4.0.3" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, + "node_modules/tinyglobby/node_modules/fdir": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/tinyglobby/node_modules/picomatch": { + "version": "4.0.3", + "resolved": 
"https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", + "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/ts-interface-checker": { + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz", + "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==", + "dev": true, + "license": "Apache-2.0" + }, + "node_modules/typescript": { + "version": "5.8.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz", + "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/update-browserslist-db": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz", + "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==", 
+ "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "escalade": "^3.2.0", + "picocolors": "^1.1.1" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/use-sync-external-store": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.6.0.tgz", + "integrity": "sha512-Pp6GSwGP/NrPIrxVFAIkOQeyw8lFenOHijQWkUTrDvrF4ALqylP2C/KCkeS9dpUM3KvYRQhna5vt7IL95+ZQ9w==", + "license": "MIT", + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "dev": true, + "license": "MIT" + }, + "node_modules/victory-vendor": { + "version": "37.3.6", + "resolved": "https://registry.npmjs.org/victory-vendor/-/victory-vendor-37.3.6.tgz", + "integrity": "sha512-SbPDPdDBYp+5MJHhBCAyI7wKM3d5ivekigc2Dk2s7pgbZ9wIgIBYGVw4zGHBml/qTFbexrofXW6Gu4noGxrOwQ==", + "license": "MIT AND ISC", + "dependencies": { + "@types/d3-array": "^3.0.3", + "@types/d3-ease": "^3.0.0", + "@types/d3-interpolate": "^3.0.1", + "@types/d3-scale": "^4.0.2", + "@types/d3-shape": "^3.1.0", + "@types/d3-time": "^3.0.0", + "@types/d3-timer": "^3.0.0", + "d3-array": "^3.1.6", + "d3-ease": "^3.0.1", + "d3-interpolate": "^3.0.1", + "d3-scale": "^4.0.2", + "d3-shape": "^3.1.0", + "d3-time": "^3.0.0", + "d3-timer": "^3.0.1" + } + }, + "node_modules/vite": { + "version": "6.4.1", + "resolved": 
"https://registry.npmjs.org/vite/-/vite-6.4.1.tgz", + "integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.25.0", + "fdir": "^6.4.4", + "picomatch": "^4.0.2", + "postcss": "^8.5.3", + "rollup": "^4.34.9", + "tinyglobby": "^0.2.13" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || ^20.0.0 || >=22.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0", + "jiti": ">=1.21.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.16.0", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "jiti": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/vite/node_modules/fdir": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/vite/node_modules/picomatch": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", + "integrity": 
"sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", + "dev": true, + "license": "ISC" + }, + "node_modules/zustand": { + "version": "4.5.7", + "resolved": "https://registry.npmjs.org/zustand/-/zustand-4.5.7.tgz", + "integrity": "sha512-CHOUy7mu3lbD6o6LJLfllpjkzhHXSBlX8B9+qPddUsIfeF5S/UZ5q0kmCsnRqT1UHFQZchNFDDzMbQsuesHWlw==", + "license": "MIT", + "dependencies": { + "use-sync-external-store": "^1.2.2" + }, + "engines": { + "node": ">=12.7.0" + }, + "peerDependencies": { + "@types/react": ">=16.8", + "immer": ">=9.0.6", + "react": ">=16.8" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "immer": { + "optional": true + }, + "react": { + "optional": true + } + } + } + } +} diff --git a/frontend/package.json b/frontend/package.json new file mode 100644 index 0000000..3fd68ad --- /dev/null +++ b/frontend/package.json @@ -0,0 +1,32 @@ +{ + "name": "graphite-annotator", + "private": true, + "version": "0.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview" + }, + "dependencies": { + "react": "^19.2.3", + "react-dom": "^19.2.3", + "lucide-react": "^0.563.0", + "recharts": "^3.7.0", + "axios": "^1.6.7", + "react-router-dom": "^6.22.0", + "zustand": "^4.5.0", + "@tanstack/react-query": "^5.20.0", + "date-fns": "^3.3.0", + "clsx": "^2.1.0" + }, + "devDependencies": { + "@types/node": "^22.14.0", + "@vitejs/plugin-react": "^5.0.0", + "typescript": "~5.8.2", + "vite": "^6.2.0", + "tailwindcss": "^3.4.1", + "autoprefixer": "^10.4.17", + "postcss": "^8.4.35" + } +} 
diff --git a/frontend/postcss.config.js b/frontend/postcss.config.js new file mode 100644 index 0000000..2e7af2b --- /dev/null +++ b/frontend/postcss.config.js @@ -0,0 +1,6 @@ +export default { + plugins: { + tailwindcss: {}, + autoprefixer: {}, + }, +} diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx new file mode 100644 index 0000000..93a0a08 --- /dev/null +++ b/frontend/src/App.tsx @@ -0,0 +1,73 @@ +import React, { useState, useEffect } from 'react' +import { Layout } from './components/Layout' +import { DashboardOverview } from './components/DashboardOverview' +import { Dashboard } from './components/Dashboard' +import { DocumentDetail } from './components/DocumentDetail' +import { Training } from './components/Training' +import { Models } from './components/Models' +import { Login } from './components/Login' +import { InferenceDemo } from './components/InferenceDemo' + +const App: React.FC = () => { + const [currentView, setCurrentView] = useState('dashboard') + const [selectedDocId, setSelectedDocId] = useState(null) + const [isAuthenticated, setIsAuthenticated] = useState(false) + + useEffect(() => { + const token = localStorage.getItem('admin_token') + setIsAuthenticated(!!token) + }, []) + + const handleNavigate = (view: string, docId?: string) => { + setCurrentView(view) + if (docId) { + setSelectedDocId(docId) + } + } + + const handleLogin = (token: string) => { + setIsAuthenticated(true) + } + + const handleLogout = () => { + localStorage.removeItem('admin_token') + setIsAuthenticated(false) + setCurrentView('documents') + } + + if (!isAuthenticated) { + return + } + + const renderContent = () => { + switch (currentView) { + case 'dashboard': + return + case 'documents': + return + case 'detail': + return ( + setCurrentView('documents')} + /> + ) + case 'demo': + return + case 'training': + return + case 'models': + return + default: + return + } + } + + return ( + + {renderContent()} + + ) +} + +export default App diff --git 
a/frontend/src/api/client.ts b/frontend/src/api/client.ts new file mode 100644 index 0000000..c1e54f6 --- /dev/null +++ b/frontend/src/api/client.ts @@ -0,0 +1,41 @@ +import axios, { AxiosInstance, AxiosError } from 'axios' + +const apiClient: AxiosInstance = axios.create({ + baseURL: import.meta.env.VITE_API_URL || 'http://localhost:8000', + headers: { + 'Content-Type': 'application/json', + }, + timeout: 30000, +}) + +apiClient.interceptors.request.use( + (config) => { + const token = localStorage.getItem('admin_token') + if (token) { + config.headers['X-Admin-Token'] = token + } + return config + }, + (error) => { + return Promise.reject(error) + } +) + +apiClient.interceptors.response.use( + (response) => response, + (error: AxiosError) => { + if (error.response?.status === 401) { + console.warn('Authentication required. Please set admin_token in localStorage.') + // Don't redirect to avoid infinite loop + // User should manually set: localStorage.setItem('admin_token', 'your-token') + } + + if (error.response?.status === 429) { + console.error('Rate limit exceeded') + } + + return Promise.reject(error) + } +) + +export default apiClient diff --git a/frontend/src/api/endpoints/annotations.ts b/frontend/src/api/endpoints/annotations.ts new file mode 100644 index 0000000..851194b --- /dev/null +++ b/frontend/src/api/endpoints/annotations.ts @@ -0,0 +1,66 @@ +import apiClient from '../client' +import type { + AnnotationItem, + CreateAnnotationRequest, + AnnotationOverrideRequest, +} from '../types' + +export const annotationsApi = { + list: async (documentId: string): Promise => { + const { data } = await apiClient.get( + `/api/v1/admin/documents/${documentId}/annotations` + ) + return data.annotations + }, + + create: async ( + documentId: string, + annotation: CreateAnnotationRequest + ): Promise => { + const { data } = await apiClient.post( + `/api/v1/admin/documents/${documentId}/annotations`, + annotation + ) + return data + }, + + update: async ( + 
documentId: string, + annotationId: string, + updates: Partial + ): Promise => { + const { data } = await apiClient.patch( + `/api/v1/admin/documents/${documentId}/annotations/${annotationId}`, + updates + ) + return data + }, + + delete: async (documentId: string, annotationId: string): Promise => { + await apiClient.delete( + `/api/v1/admin/documents/${documentId}/annotations/${annotationId}` + ) + }, + + verify: async ( + documentId: string, + annotationId: string + ): Promise<{ annotation_id: string; is_verified: boolean; message: string }> => { + const { data } = await apiClient.post( + `/api/v1/admin/documents/${documentId}/annotations/${annotationId}/verify` + ) + return data + }, + + override: async ( + documentId: string, + annotationId: string, + overrideData: AnnotationOverrideRequest + ): Promise<{ annotation_id: string; source: string; message: string }> => { + const { data } = await apiClient.patch( + `/api/v1/admin/documents/${documentId}/annotations/${annotationId}/override`, + overrideData + ) + return data + }, +} diff --git a/frontend/src/api/endpoints/documents.ts b/frontend/src/api/endpoints/documents.ts new file mode 100644 index 0000000..56e5627 --- /dev/null +++ b/frontend/src/api/endpoints/documents.ts @@ -0,0 +1,80 @@ +import apiClient from '../client' +import type { + DocumentListResponse, + DocumentDetailResponse, + DocumentItem, + UploadDocumentResponse, +} from '../types' + +export const documentsApi = { + list: async (params?: { + status?: string + limit?: number + offset?: number + }): Promise => { + const { data } = await apiClient.get('/api/v1/admin/documents', { params }) + return data + }, + + getDetail: async (documentId: string): Promise => { + const { data } = await apiClient.get(`/api/v1/admin/documents/${documentId}`) + return data + }, + + upload: async (file: File): Promise => { + const formData = new FormData() + formData.append('file', file) + + const { data } = await apiClient.post('/api/v1/admin/documents', formData, { 
+ headers: { + 'Content-Type': 'multipart/form-data', + }, + }) + return data + }, + + batchUpload: async ( + files: File[], + csvFile?: File + ): Promise<{ batch_id: string; message: string; documents_created: number }> => { + const formData = new FormData() + + files.forEach((file) => { + formData.append('files', file) + }) + + if (csvFile) { + formData.append('csv_file', csvFile) + } + + const { data } = await apiClient.post('/api/v1/admin/batch/upload', formData, { + headers: { + 'Content-Type': 'multipart/form-data', + }, + }) + return data + }, + + delete: async (documentId: string): Promise => { + await apiClient.delete(`/api/v1/admin/documents/${documentId}`) + }, + + updateStatus: async ( + documentId: string, + status: string + ): Promise => { + const { data } = await apiClient.patch( + `/api/v1/admin/documents/${documentId}/status`, + null, + { params: { status } } + ) + return data + }, + + triggerAutoLabel: async (documentId: string): Promise<{ message: string }> => { + const { data } = await apiClient.post( + `/api/v1/admin/documents/${documentId}/auto-label` + ) + return data + }, +} diff --git a/frontend/src/api/endpoints/index.ts b/frontend/src/api/endpoints/index.ts new file mode 100644 index 0000000..f24f2f5 --- /dev/null +++ b/frontend/src/api/endpoints/index.ts @@ -0,0 +1,4 @@ +export { documentsApi } from './documents' +export { annotationsApi } from './annotations' +export { trainingApi } from './training' +export { inferenceApi } from './inference' diff --git a/frontend/src/api/endpoints/inference.ts b/frontend/src/api/endpoints/inference.ts new file mode 100644 index 0000000..5542506 --- /dev/null +++ b/frontend/src/api/endpoints/inference.ts @@ -0,0 +1,16 @@ +import apiClient from '../client' +import type { InferenceResponse } from '../types' + +export const inferenceApi = { + processDocument: async (file: File): Promise => { + const formData = new FormData() + formData.append('file', file) + + const { data } = await 
apiClient.post('/api/v1/infer', formData, { + headers: { + 'Content-Type': 'multipart/form-data', + }, + }) + return data + }, +} diff --git a/frontend/src/api/endpoints/training.ts b/frontend/src/api/endpoints/training.ts new file mode 100644 index 0000000..de9c107 --- /dev/null +++ b/frontend/src/api/endpoints/training.ts @@ -0,0 +1,74 @@ +import apiClient from '../client' +import type { TrainingModelsResponse, DocumentListResponse } from '../types' + +export const trainingApi = { + getDocumentsForTraining: async (params?: { + has_annotations?: boolean + min_annotation_count?: number + exclude_used_in_training?: boolean + limit?: number + offset?: number + }): Promise => { + const { data } = await apiClient.get('/api/v1/admin/training/documents', { + params, + }) + return data + }, + + getModels: async (params?: { + status?: string + limit?: number + offset?: number + }): Promise => { + const { data} = await apiClient.get('/api/v1/admin/training/models', { + params, + }) + return data + }, + + getTaskDetail: async (taskId: string) => { + const { data } = await apiClient.get(`/api/v1/admin/training/tasks/${taskId}`) + return data + }, + + startTraining: async (config: { + name: string + description?: string + document_ids: string[] + epochs?: number + batch_size?: number + model_base?: string + }) => { + // Convert frontend config to backend TrainingTaskCreate format + const taskRequest = { + name: config.name, + task_type: 'yolo', + description: config.description, + config: { + document_ids: config.document_ids, + epochs: config.epochs, + batch_size: config.batch_size, + base_model: config.model_base, + }, + } + const { data } = await apiClient.post('/api/v1/admin/training/tasks', taskRequest) + return data + }, + + cancelTask: async (taskId: string) => { + const { data } = await apiClient.post( + `/api/v1/admin/training/tasks/${taskId}/cancel` + ) + return data + }, + + downloadModel: async (taskId: string): Promise => { + const { data } = await apiClient.get( 
+ `/api/v1/admin/training/models/${taskId}/download`, + { + responseType: 'blob', + } + ) + return data + }, +} diff --git a/frontend/src/api/types.ts b/frontend/src/api/types.ts new file mode 100644 index 0000000..c668a59 --- /dev/null +++ b/frontend/src/api/types.ts @@ -0,0 +1,173 @@ +export interface DocumentItem { + document_id: string + filename: string + file_size: number + content_type: string + page_count: number + status: 'pending' | 'labeled' | 'verified' | 'exported' + auto_label_status: 'pending' | 'running' | 'completed' | 'failed' | null + auto_label_error: string | null + upload_source: string + created_at: string + updated_at: string + annotation_count?: number + annotation_sources?: { + manual: number + auto: number + verified: number + } +} + +export interface DocumentListResponse { + documents: DocumentItem[] + total: number + limit: number + offset: number +} + +export interface AnnotationItem { + annotation_id: string + page_number: number + class_id: number + class_name: string + bbox: { + x: number + y: number + width: number + height: number + } + normalized_bbox: { + x_center: number + y_center: number + width: number + height: number + } + text_value: string | null + confidence: number | null + source: 'manual' | 'auto' + created_at: string +} + +export interface DocumentDetailResponse { + document_id: string + filename: string + file_size: number + content_type: string + page_count: number + status: 'pending' | 'labeled' | 'verified' | 'exported' + auto_label_status: 'pending' | 'running' | 'completed' | 'failed' | null + auto_label_error: string | null + upload_source: string + batch_id: string | null + csv_field_values: Record | null + can_annotate: boolean + annotation_lock_until: string | null + annotations: AnnotationItem[] + image_urls: string[] + training_history: Array<{ + task_id: string + name: string + trained_at: string + model_metrics: { + mAP: number | null + precision: number | null + recall: number | null + } | null + }> + 
created_at: string + updated_at: string +} + +export interface TrainingTask { + task_id: string + admin_token: string + name: string + description: string | null + status: 'pending' | 'running' | 'completed' | 'failed' + task_type: string + config: Record + started_at: string | null + completed_at: string | null + error_message: string | null + result_metrics: Record + model_path: string | null + document_count: number + metrics_mAP: number | null + metrics_precision: number | null + metrics_recall: number | null + created_at: string + updated_at: string +} + +export interface TrainingModelsResponse { + models: TrainingTask[] + total: number + limit: number + offset: number +} + +export interface ErrorResponse { + detail: string +} + +export interface UploadDocumentResponse { + document_id: string + filename: string + status: string + message: string +} + +export interface CreateAnnotationRequest { + page_number: number + class_id: number + bbox: { + x: number + y: number + width: number + height: number + } + text_value?: string +} + +export interface AnnotationOverrideRequest { + text_value?: string + bbox?: { + x: number + y: number + width: number + height: number + } + class_id?: number + class_name?: string + reason?: string +} + +export interface CrossValidationResult { + is_valid: boolean + payment_line_ocr: string | null + payment_line_amount: string | null + payment_line_account: string | null + payment_line_account_type: 'bankgiro' | 'plusgiro' | null + ocr_match: boolean | null + amount_match: boolean | null + bankgiro_match: boolean | null + plusgiro_match: boolean | null + details: string[] +} + +export interface InferenceResult { + document_id: string + document_type: string + success: boolean + fields: Record + confidence: Record + cross_validation: CrossValidationResult | null + processing_time_ms: number + visualization_url: string | null + errors: string[] + fallback_used: boolean +} + +export interface InferenceResponse { + result: 
InferenceResult +} diff --git a/frontend/src/components/Badge.tsx b/frontend/src/components/Badge.tsx new file mode 100644 index 0000000..cd0548f --- /dev/null +++ b/frontend/src/components/Badge.tsx @@ -0,0 +1,39 @@ +import React from 'react'; +import { DocumentStatus } from '../types'; +import { Check } from 'lucide-react'; + +interface BadgeProps { + status: DocumentStatus | 'Exported'; +} + +export const Badge: React.FC = ({ status }) => { + if (status === 'Exported') { + return ( + + + Exported + + ); + } + + const styles = { + [DocumentStatus.PENDING]: "bg-white border border-warm-divider text-warm-text-secondary", + [DocumentStatus.LABELED]: "bg-warm-text-secondary text-white border border-transparent", + [DocumentStatus.VERIFIED]: "bg-warm-state-success/10 text-warm-state-success border border-warm-state-success/20", + [DocumentStatus.PARTIAL]: "bg-warm-state-warning/10 text-warm-state-warning border border-warm-state-warning/20", + }; + + const icons = { + [DocumentStatus.VERIFIED]: , + [DocumentStatus.PARTIAL]: !, + [DocumentStatus.PENDING]: null, + [DocumentStatus.LABELED]: null, + } + + return ( + + {icons[status]} + {status} + + ); +}; \ No newline at end of file diff --git a/frontend/src/components/Button.tsx b/frontend/src/components/Button.tsx new file mode 100644 index 0000000..58c7d8c --- /dev/null +++ b/frontend/src/components/Button.tsx @@ -0,0 +1,38 @@ +import React from 'react'; + +interface ButtonProps extends React.ButtonHTMLAttributes { + variant?: 'primary' | 'secondary' | 'outline' | 'text'; + size?: 'sm' | 'md' | 'lg'; +} + +export const Button: React.FC = ({ + variant = 'primary', + size = 'md', + className = '', + children, + ...props +}) => { + const baseStyles = "inline-flex items-center justify-center rounded-md font-medium transition-all duration-150 ease-out active:scale-98 disabled:opacity-50 disabled:pointer-events-none"; + + const variants = { + primary: "bg-warm-text-secondary text-white hover:bg-warm-text-primary shadow-sm", 
+ secondary: "bg-white border border-warm-divider text-warm-text-secondary hover:bg-warm-hover", + outline: "bg-transparent border border-warm-text-secondary text-warm-text-secondary hover:bg-warm-hover", + text: "text-warm-text-muted hover:text-warm-text-primary hover:bg-warm-hover", + }; + + const sizes = { + sm: "h-8 px-3 text-xs", + md: "h-10 px-4 text-sm", + lg: "h-12 px-6 text-base", + }; + + return ( + + ); +}; \ No newline at end of file diff --git a/frontend/src/components/Dashboard.tsx b/frontend/src/components/Dashboard.tsx new file mode 100644 index 0000000..02517d7 --- /dev/null +++ b/frontend/src/components/Dashboard.tsx @@ -0,0 +1,266 @@ +import React, { useState } from 'react' +import { Search, ChevronDown, MoreHorizontal, FileText } from 'lucide-react' +import { Badge } from './Badge' +import { Button } from './Button' +import { UploadModal } from './UploadModal' +import { useDocuments } from '../hooks/useDocuments' +import type { DocumentItem } from '../api/types' + +interface DashboardProps { + onNavigate: (view: string, docId?: string) => void +} + +const getStatusForBadge = (status: string): string => { + const statusMap: Record = { + pending: 'Pending', + labeled: 'Labeled', + verified: 'Verified', + exported: 'Exported', + } + return statusMap[status] || status +} + +const getAutoLabelProgress = (doc: DocumentItem): number | undefined => { + if (doc.auto_label_status === 'running') { + return 45 + } + if (doc.auto_label_status === 'completed') { + return 100 + } + return undefined +} + +export const Dashboard: React.FC = ({ onNavigate }) => { + const [isUploadOpen, setIsUploadOpen] = useState(false) + const [selectedDocs, setSelectedDocs] = useState>(new Set()) + const [statusFilter, setStatusFilter] = useState('') + const [limit] = useState(20) + const [offset] = useState(0) + + const { documents, total, isLoading, error, refetch } = useDocuments({ + status: statusFilter || undefined, + limit, + offset, + }) + + const toggleSelection = (id: 
string) => { + const newSet = new Set(selectedDocs) + if (newSet.has(id)) { + newSet.delete(id) + } else { + newSet.add(id) + } + setSelectedDocs(newSet) + } + + if (error) { + return ( +
+
+ Error loading documents. Please check your connection to the backend API. + +
+
+ ) + } + + return ( +
+
+
+

+ Documents +

+

+ {isLoading ? 'Loading...' : `${total} documents total`} +

+
+
+ + +
+
+ +
+
+ + +
+ +
+
+ + +
+
+
+ +
+ + + + + + + + + + + + + + {isLoading ? ( + + + + ) : documents.length === 0 ? ( + + + + ) : ( + documents.map((doc) => { + const isSelected = selectedDocs.has(doc.document_id) + const progress = getAutoLabelProgress(doc) + + return ( + onNavigate('detail', doc.document_id)} + className={` + group transition-colors duration-150 cursor-pointer border-b border-warm-border last:border-0 + ${isSelected ? 'bg-warm-selected' : 'hover:bg-warm-hover bg-white'} + `} + > + + + + + + + + + ) + }) + )} + +
+ + + Document Name + + Date + + Status + + Annotations + + Auto-label +
+ Loading documents... +
+ No documents found. Upload your first document to get started. +
{ + e.stopPropagation() + toggleSelection(doc.document_id) + }} + > + {isSelected && ( +
+ )} + +
+
+
+ +
+ + {doc.filename} + +
+
+ {new Date(doc.created_at).toLocaleDateString()} + + + + {doc.annotation_count || 0} annotations + + {doc.auto_label_status === 'running' && progress && ( +
+
+ + Running + + {progress}% +
+
+
+
+
+ )} + {doc.auto_label_status === 'completed' && ( + + Completed + + )} + {doc.auto_label_status === 'failed' && ( + + Failed + + )} +
+ +
+
+ + { + setIsUploadOpen(false) + refetch() + }} + /> +
+ ) +} diff --git a/frontend/src/components/DashboardOverview.tsx b/frontend/src/components/DashboardOverview.tsx new file mode 100644 index 0000000..6de1561 --- /dev/null +++ b/frontend/src/components/DashboardOverview.tsx @@ -0,0 +1,148 @@ +import React from 'react' +import { FileText, CheckCircle, Clock, TrendingUp, Activity } from 'lucide-react' +import { Button } from './Button' +import { useDocuments } from '../hooks/useDocuments' +import { useTraining } from '../hooks/useTraining' + +interface DashboardOverviewProps { + onNavigate: (view: string) => void +} + +export const DashboardOverview: React.FC = ({ onNavigate }) => { + const { total: totalDocs, isLoading: docsLoading } = useDocuments({ limit: 1 }) + const { models, isLoadingModels } = useTraining() + + const stats = [ + { + label: 'Total Documents', + value: docsLoading ? '...' : totalDocs.toString(), + icon: FileText, + color: 'text-warm-text-primary', + bgColor: 'bg-warm-bg', + }, + { + label: 'Labeled', + value: '0', + icon: CheckCircle, + color: 'text-warm-state-success', + bgColor: 'bg-green-50', + }, + { + label: 'Pending', + value: '0', + icon: Clock, + color: 'text-warm-state-warning', + bgColor: 'bg-yellow-50', + }, + { + label: 'Training Models', + value: isLoadingModels ? '...' : models.length.toString(), + icon: TrendingUp, + color: 'text-warm-state-info', + bgColor: 'bg-blue-50', + }, + ] + + return ( +
+ {/* Header */} +
+

+ Dashboard +

+

+ Overview of your document annotation system +

+
+ + {/* Stats Grid */} +
+ {stats.map((stat) => ( +
+
+
+ +
+
+

+ {stat.value} +

+

{stat.label}

+
+ ))} +
+ + {/* Quick Actions */} +
+

+ Quick Actions +

+
+ + + +
+
+ + {/* Recent Activity */} +
+
+

+ Recent Activity +

+
+
+
+ +

No recent activity

+

+ Start by uploading documents or creating training jobs +

+
+
+
+ + {/* System Status */} +
+

+ System Status +

+
+
+ Backend API + + + Online + +
+
+ Database + + + Connected + +
+
+ GPU + + + Available + +
+
+
+
+ ) +} diff --git a/frontend/src/components/DocumentDetail.tsx b/frontend/src/components/DocumentDetail.tsx new file mode 100644 index 0000000..cf4da73 --- /dev/null +++ b/frontend/src/components/DocumentDetail.tsx @@ -0,0 +1,504 @@ +import React, { useState, useRef, useEffect } from 'react' +import { ChevronLeft, ZoomIn, ZoomOut, Plus, Edit2, Trash2, Tag, CheckCircle } from 'lucide-react' +import { Button } from './Button' +import { useDocumentDetail } from '../hooks/useDocumentDetail' +import { useAnnotations } from '../hooks/useAnnotations' +import { documentsApi } from '../api/endpoints/documents' +import type { AnnotationItem } from '../api/types' + +interface DocumentDetailProps { + docId: string + onBack: () => void +} + +// Field class mapping from backend +const FIELD_CLASSES: Record = { + 0: 'invoice_number', + 1: 'invoice_date', + 2: 'invoice_due_date', + 3: 'ocr_number', + 4: 'bankgiro', + 5: 'plusgiro', + 6: 'amount', + 7: 'supplier_organisation_number', + 8: 'payment_line', + 9: 'customer_number', +} + +export const DocumentDetail: React.FC = ({ docId, onBack }) => { + const { document, annotations, isLoading } = useDocumentDetail(docId) + const { + createAnnotation, + updateAnnotation, + deleteAnnotation, + isCreating, + isDeleting, + } = useAnnotations(docId) + + const [selectedId, setSelectedId] = useState(null) + const [zoom, setZoom] = useState(100) + const [isDrawing, setIsDrawing] = useState(false) + const [drawStart, setDrawStart] = useState<{ x: number; y: number } | null>(null) + const [drawEnd, setDrawEnd] = useState<{ x: number; y: number } | null>(null) + const [selectedClassId, setSelectedClassId] = useState(0) + const [currentPage, setCurrentPage] = useState(1) + const [imageSize, setImageSize] = useState<{ width: number; height: number } | null>(null) + const [imageBlobUrl, setImageBlobUrl] = useState(null) + + const canvasRef = useRef(null) + const imageRef = useRef(null) + + const [isMarkingComplete, setIsMarkingComplete] = 
useState(false) + + const selectedAnnotation = annotations?.find((a) => a.annotation_id === selectedId) + + // Handle mark as complete + const handleMarkComplete = async () => { + if (!annotations || annotations.length === 0) { + alert('Please add at least one annotation before marking as complete.') + return + } + + if (!confirm('Mark this document as labeled? This will save annotations to the database.')) { + return + } + + setIsMarkingComplete(true) + try { + const result = await documentsApi.updateStatus(docId, 'labeled') + alert(`Document marked as labeled. ${(result as any).fields_saved || annotations.length} annotations saved.`) + onBack() // Return to document list + } catch (error) { + console.error('Failed to mark document as complete:', error) + alert('Failed to mark document as complete. Please try again.') + } finally { + setIsMarkingComplete(false) + } + } + + // Load image via fetch with authentication header + useEffect(() => { + let objectUrl: string | null = null + + const loadImage = async () => { + if (!docId) return + + const token = localStorage.getItem('admin_token') + const imageUrl = `${import.meta.env.VITE_API_URL || 'http://localhost:8000'}/api/v1/admin/documents/${docId}/images/${currentPage}` + + try { + const response = await fetch(imageUrl, { + headers: { + 'X-Admin-Token': token || '', + }, + }) + + if (!response.ok) { + throw new Error(`Failed to load image: ${response.status}`) + } + + const blob = await response.blob() + objectUrl = URL.createObjectURL(blob) + setImageBlobUrl(objectUrl) + } catch (error) { + console.error('Failed to load image:', error) + } + } + + loadImage() + + // Cleanup: revoke object URL when component unmounts or page changes + return () => { + if (objectUrl) { + URL.revokeObjectURL(objectUrl) + } + } + }, [currentPage, docId]) + + // Load image size + useEffect(() => { + if (imageRef.current && imageRef.current.complete) { + setImageSize({ + width: imageRef.current.naturalWidth, + height: 
imageRef.current.naturalHeight, + }) + } + }, [imageBlobUrl]) + + const handleImageLoad = () => { + if (imageRef.current) { + setImageSize({ + width: imageRef.current.naturalWidth, + height: imageRef.current.naturalHeight, + }) + } + } + + const handleMouseDown = (e: React.MouseEvent) => { + if (!canvasRef.current || !imageSize) return + const rect = canvasRef.current.getBoundingClientRect() + const x = (e.clientX - rect.left) / (zoom / 100) + const y = (e.clientY - rect.top) / (zoom / 100) + setIsDrawing(true) + setDrawStart({ x, y }) + setDrawEnd({ x, y }) + } + + const handleMouseMove = (e: React.MouseEvent) => { + if (!isDrawing || !canvasRef.current || !imageSize) return + const rect = canvasRef.current.getBoundingClientRect() + const x = (e.clientX - rect.left) / (zoom / 100) + const y = (e.clientY - rect.top) / (zoom / 100) + setDrawEnd({ x, y }) + } + + const handleMouseUp = () => { + if (!isDrawing || !drawStart || !drawEnd || !imageSize) { + setIsDrawing(false) + return + } + + const bbox_x = Math.min(drawStart.x, drawEnd.x) + const bbox_y = Math.min(drawStart.y, drawEnd.y) + const bbox_width = Math.abs(drawEnd.x - drawStart.x) + const bbox_height = Math.abs(drawEnd.y - drawStart.y) + + // Only create if box is large enough (min 10x10 pixels) + if (bbox_width > 10 && bbox_height > 10) { + createAnnotation({ + page_number: currentPage, + class_id: selectedClassId, + bbox: { + x: Math.round(bbox_x), + y: Math.round(bbox_y), + width: Math.round(bbox_width), + height: Math.round(bbox_height), + }, + }) + } + + setIsDrawing(false) + setDrawStart(null) + setDrawEnd(null) + } + + const handleDeleteAnnotation = (annotationId: string) => { + if (confirm('Are you sure you want to delete this annotation?')) { + deleteAnnotation(annotationId) + setSelectedId(null) + } + } + + if (isLoading || !document) { + return ( +
+
Loading...
+
+ ) + } + + // Get current page annotations + const pageAnnotations = annotations?.filter((a) => a.page_number === currentPage) || [] + + return ( +
+ {/* Main Canvas Area */} +
+ {/* Toolbar */} +
+
+ +
+

{document.filename}

+

+ Page {currentPage} of {document.page_count} +

+
+
+
+ + + {zoom}% + + +
+
+
+ + + {document.page_count > 1 && ( +
+ + +
+ )} +
+
+ + {/* Canvas Scroll Area */} +
+
setSelectedId(null)} + > + {/* Document Image */} + {imageBlobUrl ? ( + {`Page + ) : ( +
+
Loading image...
+
+ )} + + {/* Annotation Overlays */} + {pageAnnotations.map((ann) => { + const isSelected = selectedId === ann.annotation_id + return ( +
{ + e.stopPropagation() + setSelectedId(ann.annotation_id) + }} + className={` + absolute group cursor-pointer transition-all duration-100 + ${ + ann.source === 'auto' + ? 'border border-dashed border-warm-text-muted bg-transparent' + : 'border-2 border-warm-text-secondary bg-warm-text-secondary/5' + } + ${ + isSelected + ? 'border-2 border-warm-state-info ring-4 ring-warm-state-info/10 z-20' + : 'hover:bg-warm-state-info/5 z-10' + } + `} + style={{ + left: ann.bbox.x, + top: ann.bbox.y, + width: ann.bbox.width, + height: ann.bbox.height, + }} + > + {/* Label Tag */} +
+ {ann.class_name} +
+ + {/* Resize Handles (Visual only) */} + {isSelected && ( + <> +
+
+
+
+ + )} +
+ ) + })} + + {/* Drawing Box Preview */} + {isDrawing && drawStart && drawEnd && ( +
+ )} +
+
+
+ + {/* Right Sidebar */} +
+ {/* Field Selector */} +
+

Draw Annotation

+
+ + +

+ Click and drag on the document to create a bounding box +

+
+
+ + {/* Document Info Card */} +
+
+

Document Info

+
+
+ Status + + {document.status} + +
+
+ Size + + {(document.file_size / 1024 / 1024).toFixed(2)} MB + +
+
+ Uploaded + + {new Date(document.created_at).toLocaleDateString()} + +
+
+
+
+ + {/* Annotations List */} +
+
+

Annotations

+ {pageAnnotations.length} items +
+ + {pageAnnotations.length === 0 ? ( +
+ +

No annotations yet

+

Draw on the document to add annotations

+
+ ) : ( +
+ {pageAnnotations.map((ann) => ( +
setSelectedId(ann.annotation_id)} + className={` + group p-3 rounded-md border transition-all duration-150 cursor-pointer + ${ + selectedId === ann.annotation_id + ? 'bg-warm-bg border-warm-state-info shadow-sm' + : 'bg-white border-warm-border hover:border-warm-text-muted' + } + `} + > +
+ + {ann.class_name.replace(/_/g, ' ')} + + {selectedId === ann.annotation_id && ( +
+ +
+ )} +
+

+ {ann.text_value || '(no text)'} +

+
+ + {ann.source} + + {ann.confidence && ( + + {(ann.confidence * 100).toFixed(0)}% + + )} +
+
+ ))} +
+ )} +
+
+
+ ) +} \ No newline at end of file diff --git a/frontend/src/components/InferenceDemo.tsx b/frontend/src/components/InferenceDemo.tsx new file mode 100644 index 0000000..996bc6e --- /dev/null +++ b/frontend/src/components/InferenceDemo.tsx @@ -0,0 +1,466 @@ +import React, { useState, useRef } from 'react' +import { UploadCloud, FileText, Loader2, CheckCircle2, AlertCircle, Clock } from 'lucide-react' +import { Button } from './Button' +import { inferenceApi } from '../api/endpoints' +import type { InferenceResult } from '../api/types' + +export const InferenceDemo: React.FC = () => { + const [isDragging, setIsDragging] = useState(false) + const [selectedFile, setSelectedFile] = useState(null) + const [isProcessing, setIsProcessing] = useState(false) + const [result, setResult] = useState(null) + const [error, setError] = useState(null) + const fileInputRef = useRef(null) + + const handleFileSelect = (file: File | null) => { + if (!file) return + + const validTypes = ['application/pdf', 'image/png', 'image/jpeg', 'image/jpg'] + if (!validTypes.includes(file.type)) { + setError('Please upload a PDF, PNG, or JPG file') + return + } + + if (file.size > 50 * 1024 * 1024) { + setError('File size must be less than 50MB') + return + } + + setSelectedFile(file) + setResult(null) + setError(null) + } + + const handleDrop = (e: React.DragEvent) => { + e.preventDefault() + setIsDragging(false) + if (e.dataTransfer.files.length > 0) { + handleFileSelect(e.dataTransfer.files[0]) + } + } + + const handleBrowseClick = () => { + fileInputRef.current?.click() + } + + const handleProcess = async () => { + if (!selectedFile) return + + setIsProcessing(true) + setError(null) + + try { + const response = await inferenceApi.processDocument(selectedFile) + console.log('API Response:', response) + console.log('Visualization URL:', response.result?.visualization_url) + setResult(response.result) + } catch (err) { + setError(err instanceof Error ? 
err.message : 'Processing failed') + } finally { + setIsProcessing(false) + } + } + + const handleReset = () => { + setSelectedFile(null) + setResult(null) + setError(null) + } + + const formatFieldName = (field: string): string => { + const fieldNames: Record = { + InvoiceNumber: 'Invoice Number', + InvoiceDate: 'Invoice Date', + InvoiceDueDate: 'Due Date', + OCR: 'OCR Number', + Amount: 'Amount', + Bankgiro: 'Bankgiro', + Plusgiro: 'Plusgiro', + supplier_org_number: 'Supplier Org Number', + customer_number: 'Customer Number', + payment_line: 'Payment Line', + } + return fieldNames[field] || field + } + + return ( +
+ {/* Header */} +
+

+ Invoice Extraction Demo +

+

+ Upload a Swedish invoice to see our AI-powered field extraction in action +

+
+ + {/* Upload Area */} + {!result && ( +
+
+
{ + e.preventDefault() + setIsDragging(true) + }} + onDragLeave={() => setIsDragging(false)} + onDrop={handleDrop} + onClick={handleBrowseClick} + > +
+ {isProcessing ? ( + <> + +
+

+ Processing invoice... +

+

+ This may take a few moments +

+
+ + ) : selectedFile ? ( + <> +
+ +
+
+

+ {selectedFile.name} +

+

+ {(selectedFile.size / 1024 / 1024).toFixed(2)} MB +

+
+ + ) : ( + <> +
+ +
+
+

+ Drag & drop invoice here +

+

+ or{' '} + + browse files + +

+

+ Supports PDF, PNG, JPG (up to 50MB) +

+
+ + )} +
+
+ + handleFileSelect(e.target.files?.[0] || null)} + /> + + {error && ( +
+ + {error} +
+ )} + + {selectedFile && !isProcessing && ( +
+ + +
+ )} +
+
+ )} + + {/* Results */} + {result && ( +
+ {/* Status Header */} +
+
+
+ {result.success ? ( +
+ +
+ ) : ( +
+ +
+ )} +
+

+ {result.success ? 'Extraction Complete' : 'Partial Results'} +

+

+ Document ID: {result.document_id} +

+
+
+ +
+ +
+
+ + + {result.processing_time_ms.toFixed(0)}ms + +
+ {result.fallback_used && ( + + Fallback OCR Used + + )} +
+
+ + {/* Main Content Grid */} +
+ {/* Left Column: Extracted Fields */} +
+
+

+ + Extracted Fields +

+
+ {Object.entries(result.fields).map(([field, value]) => { + const confidence = result.confidence[field] + return ( +
+
+ {formatFieldName(field)} +
+
+ {value || N/A} +
+ {confidence && ( +
+ + {(confidence * 100).toFixed(1)}% +
+ )} +
+ ) + })} +
+
+ + {/* Visualization */} + {result.visualization_url && ( +
+

+ + Detection Visualization +

+
+ Detection visualization +
+
+ )} +
+ + {/* Right Column: Cross-Validation & Errors */} +
+ {/* Cross-Validation */} + {result.cross_validation && ( +
+

+ + Payment Line Validation +

+ +
+ {result.cross_validation.is_valid ? ( + <> + + All Fields Match + + ) : ( + <> + + Mismatch Detected + + )} +
+ +
+ {result.cross_validation.payment_line_ocr && ( +
+
+
+
+ OCR NUMBER +
+
+ {result.cross_validation.payment_line_ocr} +
+
+ {result.cross_validation.ocr_match === true && ( + + )} + {result.cross_validation.ocr_match === false && ( + + )} +
+
+ )} + + {result.cross_validation.payment_line_amount && ( +
+
+
+
+ AMOUNT +
+
+ {result.cross_validation.payment_line_amount} +
+
+ {result.cross_validation.amount_match === true && ( + + )} + {result.cross_validation.amount_match === false && ( + + )} +
+
+ )} + + {result.cross_validation.payment_line_account && ( +
+
+
+
+ {result.cross_validation.payment_line_account_type === 'bankgiro' + ? 'BANKGIRO' + : 'PLUSGIRO'} +
+
+ {result.cross_validation.payment_line_account} +
+
+ {(result.cross_validation.payment_line_account_type === 'bankgiro' + ? result.cross_validation.bankgiro_match + : result.cross_validation.plusgiro_match) === true && ( + + )} + {(result.cross_validation.payment_line_account_type === 'bankgiro' + ? result.cross_validation.bankgiro_match + : result.cross_validation.plusgiro_match) === false && ( + + )} +
+
+ )} +
+ + {result.cross_validation.details.length > 0 && ( +
+ {result.cross_validation.details[result.cross_validation.details.length - 1]} +
+ )} +
+ )} + + {/* Errors */} + {result.errors.length > 0 && ( +
+

+ + Issues +

+
+ {result.errors.map((err, idx) => ( +
+ + {err} +
+ ))} +
+
+ )} +
+
+
+ )} +
+ ) +} diff --git a/frontend/src/components/Layout.tsx b/frontend/src/components/Layout.tsx new file mode 100644 index 0000000..d5ed60c --- /dev/null +++ b/frontend/src/components/Layout.tsx @@ -0,0 +1,102 @@ +import React, { useState } from 'react'; +import { Box, LayoutTemplate, Users, BookOpen, LogOut, Sparkles } from 'lucide-react'; + +interface LayoutProps { + children: React.ReactNode; + activeView: string; + onNavigate: (view: string) => void; + onLogout?: () => void; +} + +export const Layout: React.FC = ({ children, activeView, onNavigate, onLogout }) => { + const [showDropdown, setShowDropdown] = useState(false); + const navItems = [ + { id: 'dashboard', label: 'Dashboard', icon: LayoutTemplate }, + { id: 'demo', label: 'Demo', icon: Sparkles }, + { id: 'training', label: 'Training', icon: Box }, // Mapped to Compliants visually in prompt, using logical name + { id: 'documents', label: 'Documents', icon: BookOpen }, + { id: 'models', label: 'Models', icon: Users }, // Contacts in prompt, mapped to models for this use case + ]; + + return ( +
+ {/* Top Navigation */} + + + {/* Main Content */} +
+ {children} +
+
+ ); +}; \ No newline at end of file diff --git a/frontend/src/components/Login.tsx b/frontend/src/components/Login.tsx new file mode 100644 index 0000000..2f56bab --- /dev/null +++ b/frontend/src/components/Login.tsx @@ -0,0 +1,188 @@ +import React, { useState } from 'react' +import { Button } from './Button' + +interface LoginProps { + onLogin: (token: string) => void +} + +export const Login: React.FC = ({ onLogin }) => { + const [token, setToken] = useState('') + const [name, setName] = useState('') + const [description, setDescription] = useState('') + const [isCreating, setIsCreating] = useState(false) + const [error, setError] = useState('') + const [createdToken, setCreatedToken] = useState('') + + const handleLoginWithToken = () => { + if (!token.trim()) { + setError('Please enter a token') + return + } + localStorage.setItem('admin_token', token.trim()) + onLogin(token.trim()) + } + + const handleCreateToken = async () => { + if (!name.trim()) { + setError('Please enter a token name') + return + } + + setIsCreating(true) + setError('') + + try { + const response = await fetch('http://localhost:8000/api/v1/admin/auth/token', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + name: name.trim(), + description: description.trim() || undefined, + }), + }) + + if (!response.ok) { + throw new Error('Failed to create token') + } + + const data = await response.json() + setCreatedToken(data.token) + setToken(data.token) + setError('') + } catch (err) { + setError('Failed to create token. Please check your connection.') + console.error(err) + } finally { + setIsCreating(false) + } + } + + const handleUseCreatedToken = () => { + if (createdToken) { + localStorage.setItem('admin_token', createdToken) + onLogin(createdToken) + } + } + + return ( +
+
+

+ Admin Authentication +

+

+ Sign in with an admin token to access the document management system +

+ + {error && ( +
+ {error} +
+ )} + + {createdToken && ( +
+

Token created successfully!

+
+ + {createdToken} + +
+

+ Save this token securely. You won't be able to see it again. +

+ +
+ )} + +
+ {/* Login with existing token */} +
+

+ Sign in with existing token +

+
+
+ + setToken(e.target.value)} + placeholder="Enter your admin token" + className="w-full px-3 py-2 border border-warm-border rounded-md text-sm focus:outline-none focus:ring-1 focus:ring-warm-state-info font-mono" + onKeyDown={(e) => e.key === 'Enter' && handleLoginWithToken()} + /> +
+ +
+
+ +
+
+
+
+
+ OR +
+
+ + {/* Create new token */} +
+

+ Create new admin token +

+
+
+ + setName(e.target.value)} + placeholder="e.g., my-laptop" + className="w-full px-3 py-2 border border-warm-border rounded-md text-sm focus:outline-none focus:ring-1 focus:ring-warm-state-info" + /> +
+
+ + setDescription(e.target.value)} + placeholder="e.g., Personal laptop access" + className="w-full px-3 py-2 border border-warm-border rounded-md text-sm focus:outline-none focus:ring-1 focus:ring-warm-state-info" + /> +
+ +
+
+
+ +
+

+ Admin tokens are used to authenticate with the document management API. + Keep your tokens secure and never share them. +

+
+
+
+ ) +} diff --git a/frontend/src/components/Models.tsx b/frontend/src/components/Models.tsx new file mode 100644 index 0000000..c35052f --- /dev/null +++ b/frontend/src/components/Models.tsx @@ -0,0 +1,134 @@ +import React from 'react'; +import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer } from 'recharts'; +import { Button } from './Button'; + +const CHART_DATA = [ + { name: 'Model A', value: 75 }, + { name: 'Model B', value: 82 }, + { name: 'Model C', value: 95 }, + { name: 'Model D', value: 68 }, +]; + +const METRICS_DATA = [ + { name: 'Precision', value: 88 }, + { name: 'Recall', value: 76 }, + { name: 'F1 Score', value: 91 }, + { name: 'Accuracy', value: 82 }, +]; + +const JOBS = [ + { id: 1, name: 'Training Job Job 1', date: '12/29/2024 10:33 PM', status: 'Running', progress: 65 }, + { id: 2, name: 'Training Job 2', date: '12/29/2024 10:33 PM', status: 'Completed', success: 37, metrics: 89 }, + { id: 3, name: 'Model Training Compentr 1', date: '12/29/2024 10:19 PM', status: 'Completed', success: 87, metrics: 92 }, +]; + +export const Models: React.FC = () => { + return ( +
+ {/* Left: Job History */} +
+

Models & History

+

Recent Training Jobs

+ +
+ {JOBS.map(job => ( +
+
+
+

{job.name}

+

Started {job.date}

+
+ + {job.status} + +
+ + {job.status === 'Running' ? ( +
+
+
+
+
+ ) : ( +
+
+ Success + {job.success} +
+
+ Performance + {job.metrics}% +
+
+ Completed + 100% +
+
+ )} +
+ ))} +
+
+ + {/* Right: Model Detail */} +
+
+
+

Model Detail

+ Completed +
+ +
+

Model name

+

Invoices Q4 v2.1

+
+ +
+ {/* Chart 1 */} +
+

Bar Rate Metrics

+
+ + + + + + + + + +
+
+ + {/* Chart 2 */} +
+

Entity Extraction Accuracy

+
+ + + + + + + + + +
+
+
+ +
+ +
+ + +
+
+
+
+
+ ); +}; \ No newline at end of file diff --git a/frontend/src/components/Training.tsx b/frontend/src/components/Training.tsx new file mode 100644 index 0000000..39a9976 --- /dev/null +++ b/frontend/src/components/Training.tsx @@ -0,0 +1,113 @@ +import React, { useState } from 'react'; +import { Check, AlertCircle } from 'lucide-react'; +import { Button } from './Button'; +import { DocumentStatus } from '../types'; + +export const Training: React.FC = () => { + const [split, setSplit] = useState(80); + + const docs = [ + { id: '1', name: 'Document Document 1', date: '12/28/2024', status: DocumentStatus.VERIFIED }, + { id: '2', name: 'Document Document 2', date: '12/29/2024', status: DocumentStatus.VERIFIED }, + { id: '3', name: 'Document Document 3', date: '12/29/2024', status: DocumentStatus.VERIFIED }, + { id: '4', name: 'Document Document 4', date: '12/29/2024', status: DocumentStatus.PARTIAL }, + { id: '5', name: 'Document Document 5', date: '12/29/2024', status: DocumentStatus.PARTIAL }, + { id: '6', name: 'Document Document 6', date: '12/29/2024', status: DocumentStatus.PARTIAL }, + { id: '8', name: 'Document Document 8', date: '12/29/2024', status: DocumentStatus.VERIFIED }, + ]; + + return ( +
+ {/* Document Selection List */} +
+

Document Selection

+ +
+
+ + + + + + + + + + + {docs.map(doc => ( + + + + + + + ))} + +
Document nameDateStatus
{doc.name}{doc.date} + {doc.status === DocumentStatus.VERIFIED ? ( +
+
+ +
+ Verified +
+ ) : ( +
+
+ ! +
+ Partial +
+ )} +
+
+
+
+ + {/* Configuration Panel */} +
+
+

Training Configuration

+ +
+
+ + +
+ +
+ + +
+ +
+
+ + {split}% / {100-split}% +
+ setSplit(parseInt(e.target.value))} + className="w-full h-1.5 bg-warm-border rounded-lg appearance-none cursor-pointer accent-warm-state-info" + /> +
+ +
+ +
+
+
+
+
+ ); +}; \ No newline at end of file diff --git a/frontend/src/components/UploadModal.tsx b/frontend/src/components/UploadModal.tsx new file mode 100644 index 0000000..ca7b9b3 --- /dev/null +++ b/frontend/src/components/UploadModal.tsx @@ -0,0 +1,210 @@ +import React, { useState, useRef } from 'react' +import { X, UploadCloud, File, CheckCircle, AlertCircle } from 'lucide-react' +import { Button } from './Button' +import { useDocuments } from '../hooks/useDocuments' + +interface UploadModalProps { + isOpen: boolean + onClose: () => void +} + +export const UploadModal: React.FC = ({ isOpen, onClose }) => { + const [isDragging, setIsDragging] = useState(false) + const [selectedFiles, setSelectedFiles] = useState([]) + const [uploadStatus, setUploadStatus] = useState<'idle' | 'uploading' | 'success' | 'error'>('idle') + const [errorMessage, setErrorMessage] = useState('') + const fileInputRef = useRef(null) + + const { uploadDocument, isUploading } = useDocuments({}) + + if (!isOpen) return null + + const handleFileSelect = (files: FileList | null) => { + if (!files) return + + const pdfFiles = Array.from(files).filter(file => { + const isPdf = file.type === 'application/pdf' + const isImage = file.type.startsWith('image/') + const isUnder25MB = file.size <= 25 * 1024 * 1024 + return (isPdf || isImage) && isUnder25MB + }) + + setSelectedFiles(prev => [...prev, ...pdfFiles]) + setUploadStatus('idle') + setErrorMessage('') + } + + const handleDrop = (e: React.DragEvent) => { + e.preventDefault() + setIsDragging(false) + handleFileSelect(e.dataTransfer.files) + } + + const handleBrowseClick = () => { + fileInputRef.current?.click() + } + + const removeFile = (index: number) => { + setSelectedFiles(prev => prev.filter((_, i) => i !== index)) + } + + const handleUpload = async () => { + if (selectedFiles.length === 0) { + setErrorMessage('Please select at least one file') + return + } + + setUploadStatus('uploading') + setErrorMessage('') + + try { + // Upload files one by 
one + for (const file of selectedFiles) { + await new Promise((resolve, reject) => { + uploadDocument(file, { + onSuccess: () => resolve(), + onError: (error: Error) => reject(error), + }) + }) + } + + setUploadStatus('success') + setTimeout(() => { + onClose() + setSelectedFiles([]) + setUploadStatus('idle') + }, 1500) + } catch (error) { + setUploadStatus('error') + setErrorMessage(error instanceof Error ? error.message : 'Upload failed') + } + } + + const handleClose = () => { + if (uploadStatus === 'uploading') { + return // Prevent closing during upload + } + setSelectedFiles([]) + setUploadStatus('idle') + setErrorMessage('') + onClose() + } + + return ( +
+
e.stopPropagation()} + > +
+

Upload Documents

+ +
+ + {/* Drop Zone */} +
{ e.preventDefault(); setIsDragging(true); }} + onDragLeave={() => setIsDragging(false)} + onDrop={handleDrop} + onClick={handleBrowseClick} + > +
+ +
+
+

+ Drag & drop files here or Browse +

+

PDF, JPG, PNG up to 25MB

+
+
+ + handleFileSelect(e.target.files)} + /> + + {/* Selected Files */} + {selectedFiles.length > 0 && ( +
+

+ Selected Files ({selectedFiles.length}) +

+
+ {selectedFiles.map((file, index) => ( +
+
+ + + {file.name} + + + ({(file.size / 1024 / 1024).toFixed(2)} MB) + +
+ +
+ ))} +
+
+ )} + + {/* Status Messages */} + {uploadStatus === 'success' && ( +
+ + Upload successful! +
+ )} + + {uploadStatus === 'error' && errorMessage && ( +
+ + {errorMessage} +
+ )} + + {/* Actions */} +
+ + +
+
+
+ ) +} diff --git a/frontend/src/hooks/index.ts b/frontend/src/hooks/index.ts new file mode 100644 index 0000000..d642394 --- /dev/null +++ b/frontend/src/hooks/index.ts @@ -0,0 +1,4 @@ +export { useDocuments } from './useDocuments' +export { useDocumentDetail } from './useDocumentDetail' +export { useAnnotations } from './useAnnotations' +export { useTraining, useTrainingDocuments } from './useTraining' diff --git a/frontend/src/hooks/useAnnotations.ts b/frontend/src/hooks/useAnnotations.ts new file mode 100644 index 0000000..b6e0e25 --- /dev/null +++ b/frontend/src/hooks/useAnnotations.ts @@ -0,0 +1,70 @@ +import { useMutation, useQueryClient } from '@tanstack/react-query' +import { annotationsApi } from '../api/endpoints' +import type { CreateAnnotationRequest, AnnotationOverrideRequest } from '../api/types' + +export const useAnnotations = (documentId: string) => { + const queryClient = useQueryClient() + + const createMutation = useMutation({ + mutationFn: (annotation: CreateAnnotationRequest) => + annotationsApi.create(documentId, annotation), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['document', documentId] }) + }, + }) + + const updateMutation = useMutation({ + mutationFn: ({ + annotationId, + updates, + }: { + annotationId: string + updates: Partial + }) => annotationsApi.update(documentId, annotationId, updates), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['document', documentId] }) + }, + }) + + const deleteMutation = useMutation({ + mutationFn: (annotationId: string) => + annotationsApi.delete(documentId, annotationId), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['document', documentId] }) + }, + }) + + const verifyMutation = useMutation({ + mutationFn: (annotationId: string) => + annotationsApi.verify(documentId, annotationId), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['document', documentId] }) + }, + }) + + const overrideMutation = useMutation({ + mutationFn: 
({ + annotationId, + overrideData, + }: { + annotationId: string + overrideData: AnnotationOverrideRequest + }) => annotationsApi.override(documentId, annotationId, overrideData), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['document', documentId] }) + }, + }) + + return { + createAnnotation: createMutation.mutate, + isCreating: createMutation.isPending, + updateAnnotation: updateMutation.mutate, + isUpdating: updateMutation.isPending, + deleteAnnotation: deleteMutation.mutate, + isDeleting: deleteMutation.isPending, + verifyAnnotation: verifyMutation.mutate, + isVerifying: verifyMutation.isPending, + overrideAnnotation: overrideMutation.mutate, + isOverriding: overrideMutation.isPending, + } +} diff --git a/frontend/src/hooks/useDocumentDetail.ts b/frontend/src/hooks/useDocumentDetail.ts new file mode 100644 index 0000000..059e476 --- /dev/null +++ b/frontend/src/hooks/useDocumentDetail.ts @@ -0,0 +1,25 @@ +import { useQuery } from '@tanstack/react-query' +import { documentsApi } from '../api/endpoints' +import type { DocumentDetailResponse } from '../api/types' + +export const useDocumentDetail = (documentId: string | null) => { + const { data, isLoading, error, refetch } = useQuery({ + queryKey: ['document', documentId], + queryFn: () => { + if (!documentId) { + throw new Error('Document ID is required') + } + return documentsApi.getDetail(documentId) + }, + enabled: !!documentId, + staleTime: 10000, + }) + + return { + document: data || null, + annotations: data?.annotations || [], + isLoading, + error, + refetch, + } +} diff --git a/frontend/src/hooks/useDocuments.ts b/frontend/src/hooks/useDocuments.ts new file mode 100644 index 0000000..22e07c1 --- /dev/null +++ b/frontend/src/hooks/useDocuments.ts @@ -0,0 +1,78 @@ +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query' +import { documentsApi } from '../api/endpoints' +import type { DocumentListResponse, UploadDocumentResponse } from '../api/types' + +interface 
UseDocumentsParams { + status?: string + limit?: number + offset?: number +} + +export const useDocuments = (params: UseDocumentsParams = {}) => { + const queryClient = useQueryClient() + + const { data, isLoading, error, refetch } = useQuery({ + queryKey: ['documents', params], + queryFn: () => documentsApi.list(params), + staleTime: 30000, + }) + + const uploadMutation = useMutation({ + mutationFn: (file: File) => documentsApi.upload(file), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['documents'] }) + }, + }) + + const batchUploadMutation = useMutation({ + mutationFn: ({ files, csvFile }: { files: File[]; csvFile?: File }) => + documentsApi.batchUpload(files, csvFile), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['documents'] }) + }, + }) + + const deleteMutation = useMutation({ + mutationFn: (documentId: string) => documentsApi.delete(documentId), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['documents'] }) + }, + }) + + const updateStatusMutation = useMutation({ + mutationFn: ({ documentId, status }: { documentId: string; status: string }) => + documentsApi.updateStatus(documentId, status), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['documents'] }) + }, + }) + + const triggerAutoLabelMutation = useMutation({ + mutationFn: (documentId: string) => documentsApi.triggerAutoLabel(documentId), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['documents'] }) + }, + }) + + return { + documents: data?.documents || [], + total: data?.total || 0, + limit: data?.limit || params.limit || 20, + offset: data?.offset || params.offset || 0, + isLoading, + error, + refetch, + uploadDocument: uploadMutation.mutate, + uploadDocumentAsync: uploadMutation.mutateAsync, + isUploading: uploadMutation.isPending, + batchUpload: batchUploadMutation.mutate, + batchUploadAsync: batchUploadMutation.mutateAsync, + isBatchUploading: batchUploadMutation.isPending, + deleteDocument: 
deleteMutation.mutate, + isDeleting: deleteMutation.isPending, + updateStatus: updateStatusMutation.mutate, + isUpdatingStatus: updateStatusMutation.isPending, + triggerAutoLabel: triggerAutoLabelMutation.mutate, + isTriggeringAutoLabel: triggerAutoLabelMutation.isPending, + } +} diff --git a/frontend/src/hooks/useTraining.ts b/frontend/src/hooks/useTraining.ts new file mode 100644 index 0000000..6df4c80 --- /dev/null +++ b/frontend/src/hooks/useTraining.ts @@ -0,0 +1,83 @@ +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query' +import { trainingApi } from '../api/endpoints' +import type { TrainingModelsResponse } from '../api/types' + +export const useTraining = () => { + const queryClient = useQueryClient() + + const { data: modelsData, isLoading: isLoadingModels } = + useQuery({ + queryKey: ['training', 'models'], + queryFn: () => trainingApi.getModels(), + staleTime: 30000, + }) + + const startTrainingMutation = useMutation({ + mutationFn: (config: { + name: string + description?: string + document_ids: string[] + epochs?: number + batch_size?: number + model_base?: string + }) => trainingApi.startTraining(config), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['training', 'models'] }) + }, + }) + + const cancelTaskMutation = useMutation({ + mutationFn: (taskId: string) => trainingApi.cancelTask(taskId), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['training', 'models'] }) + }, + }) + + const downloadModelMutation = useMutation({ + mutationFn: (taskId: string) => trainingApi.downloadModel(taskId), + onSuccess: (blob, taskId) => { + const url = window.URL.createObjectURL(blob) + const a = document.createElement('a') + a.href = url + a.download = `model-${taskId}.pt` + document.body.appendChild(a) + a.click() + window.URL.revokeObjectURL(url) + document.body.removeChild(a) + }, + }) + + return { + models: modelsData?.models || [], + total: modelsData?.total || 0, + isLoadingModels, + startTraining: 
startTrainingMutation.mutate, + startTrainingAsync: startTrainingMutation.mutateAsync, + isStartingTraining: startTrainingMutation.isPending, + cancelTask: cancelTaskMutation.mutate, + isCancelling: cancelTaskMutation.isPending, + downloadModel: downloadModelMutation.mutate, + isDownloading: downloadModelMutation.isPending, + } +} + +export const useTrainingDocuments = (params?: { + has_annotations?: boolean + min_annotation_count?: number + exclude_used_in_training?: boolean + limit?: number + offset?: number +}) => { + const { data, isLoading, error } = useQuery({ + queryKey: ['training', 'documents', params], + queryFn: () => trainingApi.getDocumentsForTraining(params), + staleTime: 30000, + }) + + return { + documents: data?.documents || [], + total: data?.total || 0, + isLoading, + error, + } +} diff --git a/frontend/src/main.tsx b/frontend/src/main.tsx new file mode 100644 index 0000000..7159fb6 --- /dev/null +++ b/frontend/src/main.tsx @@ -0,0 +1,23 @@ +import React from 'react' +import ReactDOM from 'react-dom/client' +import { QueryClient, QueryClientProvider } from '@tanstack/react-query' +import App from './App' +import './styles/index.css' + +const queryClient = new QueryClient({ + defaultOptions: { + queries: { + retry: 1, + refetchOnWindowFocus: false, + staleTime: 30000, + }, + }, +}) + +ReactDOM.createRoot(document.getElementById('root')!).render( + + + + + +) diff --git a/frontend/src/styles/index.css b/frontend/src/styles/index.css new file mode 100644 index 0000000..39a20b1 --- /dev/null +++ b/frontend/src/styles/index.css @@ -0,0 +1,26 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; + +@layer base { + body { + @apply bg-warm-bg text-warm-text-primary; + } + + /* Custom scrollbar */ + ::-webkit-scrollbar { + @apply w-2 h-2; + } + + ::-webkit-scrollbar-track { + @apply bg-transparent; + } + + ::-webkit-scrollbar-thumb { + @apply bg-warm-divider rounded; + } + + ::-webkit-scrollbar-thumb:hover { + @apply bg-warm-text-disabled; + } 
+} diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts new file mode 100644 index 0000000..d843d5a --- /dev/null +++ b/frontend/src/types/index.ts @@ -0,0 +1,48 @@ +// Legacy types for backward compatibility with old components +// These will be gradually replaced with API types + +export enum DocumentStatus { + PENDING = 'Pending', + LABELED = 'Labeled', + VERIFIED = 'Verified', + PARTIAL = 'Partial' +} + +export interface Document { + id: string + name: string + date: string + status: DocumentStatus + exported: boolean + autoLabelProgress?: number + autoLabelStatus?: 'Running' | 'Completed' | 'Failed' +} + +export interface Annotation { + id: string + text: string + label: string + x: number + y: number + width: number + height: number + isAuto?: boolean +} + +export interface TrainingJob { + id: string + name: string + startDate: string + status: 'Running' | 'Completed' | 'Failed' + progress: number + metrics?: { + accuracy: number + precision: number + recall: number + } +} + +export interface ModelMetric { + name: string + value: number +} diff --git a/frontend/tailwind.config.js b/frontend/tailwind.config.js new file mode 100644 index 0000000..92dce15 --- /dev/null +++ b/frontend/tailwind.config.js @@ -0,0 +1,47 @@ +export default { + content: ['./index.html', './src/**/*.{js,ts,jsx,tsx}'], + theme: { + extend: { + fontFamily: { + sans: ['Inter', 'SF Pro', 'system-ui', 'sans-serif'], + mono: ['JetBrains Mono', 'SF Mono', 'monospace'], + }, + colors: { + warm: { + bg: '#FAFAF8', + card: '#FFFFFF', + hover: '#F1F0ED', + selected: '#ECEAE6', + border: '#E6E4E1', + divider: '#D8D6D2', + text: { + primary: '#121212', + secondary: '#2A2A2A', + muted: '#6B6B6B', + disabled: '#9A9A9A', + }, + state: { + success: '#3E4A3A', + error: '#4A3A3A', + warning: '#4A4A3A', + info: '#3A3A3A', + } + } + }, + boxShadow: { + 'card': '0 1px 3px rgba(0,0,0,0.08)', + 'modal': '0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06)', + }, + 
animation: { + 'fade-in': 'fadeIn 0.3s ease-out', + }, + keyframes: { + fadeIn: { + '0%': { opacity: '0', transform: 'translateY(10px)' }, + '100%': { opacity: '1', transform: 'translateY(0)' }, + } + } + } + }, + plugins: [], +} diff --git a/frontend/tsconfig.json b/frontend/tsconfig.json new file mode 100644 index 0000000..2c6eed5 --- /dev/null +++ b/frontend/tsconfig.json @@ -0,0 +1,29 @@ +{ + "compilerOptions": { + "target": "ES2022", + "experimentalDecorators": true, + "useDefineForClassFields": false, + "module": "ESNext", + "lib": [ + "ES2022", + "DOM", + "DOM.Iterable" + ], + "skipLibCheck": true, + "types": [ + "node" + ], + "moduleResolution": "bundler", + "isolatedModules": true, + "moduleDetection": "force", + "allowJs": true, + "jsx": "react-jsx", + "paths": { + "@/*": [ + "./*" + ] + }, + "allowImportingTsExtensions": true, + "noEmit": true + } +} \ No newline at end of file diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts new file mode 100644 index 0000000..5bdd5b7 --- /dev/null +++ b/frontend/vite.config.ts @@ -0,0 +1,16 @@ +import { defineConfig } from 'vite'; +import react from '@vitejs/plugin-react'; + +export default defineConfig({ + server: { + port: 3000, + host: '0.0.0.0', + proxy: { + '/api': { + target: 'http://localhost:8000', + changeOrigin: true, + }, + }, + }, + plugins: [react()], +}); diff --git a/requirements.txt b/requirements.txt index d980443..2cb7eca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,3 +21,7 @@ pyyaml>=6.0 # YAML config files # Utilities tqdm>=4.65.0 # Progress bars python-dotenv>=1.0.0 # Environment variable management + +# Database +psycopg2-binary>=2.9.0 # PostgreSQL driver +sqlmodel>=0.0.22 # SQLModel ORM (SQLAlchemy + Pydantic) diff --git a/src/cli/analyze_labels.py b/src/cli/analyze_labels.py index f370fb2..55df436 100644 --- a/src/cli/analyze_labels.py +++ b/src/cli/analyze_labels.py @@ -16,7 +16,7 @@ from pathlib import Path from typing import Optional sys.path.insert(0, 
str(Path(__file__).parent.parent.parent)) -from config import get_db_connection_string +from src.config import get_db_connection_string from ..normalize import normalize_field from ..matcher import FieldMatcher diff --git a/src/cli/analyze_report.py b/src/cli/analyze_report.py index 011417d..366b1e4 100644 --- a/src/cli/analyze_report.py +++ b/src/cli/analyze_report.py @@ -12,7 +12,7 @@ from collections import defaultdict from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) -from config import get_db_connection_string +from src.config import get_db_connection_string def load_reports_from_db() -> dict: diff --git a/src/cli/autolabel.py b/src/cli/autolabel.py index 045f188..7c10391 100644 --- a/src/cli/autolabel.py +++ b/src/cli/autolabel.py @@ -34,7 +34,7 @@ if sys.platform == 'win32': multiprocessing.set_start_method('spawn', force=True) sys.path.insert(0, str(Path(__file__).parent.parent.parent)) -from config import get_db_connection_string, PATHS, AUTOLABEL +from src.config import get_db_connection_string, PATHS, AUTOLABEL # Global OCR engine for worker processes (initialized once per worker) _worker_ocr_engine = None diff --git a/src/cli/import_report_to_db.py b/src/cli/import_report_to_db.py index 3afd81d..1cb058b 100644 --- a/src/cli/import_report_to_db.py +++ b/src/cli/import_report_to_db.py @@ -16,7 +16,7 @@ from psycopg2.extras import execute_values # Add project root to path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) -from config import get_db_connection_string, PATHS +from src.config import get_db_connection_string, PATHS def create_tables(conn): diff --git a/src/cli/infer.py b/src/cli/infer.py index 6befa49..c4ec682 100644 --- a/src/cli/infer.py +++ b/src/cli/infer.py @@ -10,6 +10,9 @@ import json import sys from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +from src.config import DEFAULT_DPI + def main(): parser = argparse.ArgumentParser( @@ -38,8 +41,8 @@ def 
main(): parser.add_argument( '--dpi', type=int, - default=150, - help='DPI for PDF rendering (default: 150, must match training)' + default=DEFAULT_DPI, + help=f'DPI for PDF rendering (default: {DEFAULT_DPI}, must match training)' ) parser.add_argument( '--no-fallback', diff --git a/src/cli/reprocess_failed.py b/src/cli/reprocess_failed.py index b2a65ff..e551317 100644 --- a/src/cli/reprocess_failed.py +++ b/src/cli/reprocess_failed.py @@ -17,6 +17,7 @@ from tqdm import tqdm sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +from src.config import DEFAULT_DPI from src.data.db import DocumentDB from src.data.csv_loader import CSVLoader from src.normalize.normalizer import normalize_field @@ -144,7 +145,7 @@ def process_single_document(args): ocr_engine = OCREngine() for page_no in range(pdf_doc.page_count): # Render page to image - img = pdf_doc.render_page(page_no, dpi=150) + img = pdf_doc.render_page(page_no, dpi=DEFAULT_DPI) if img is None: continue diff --git a/src/cli/serve.py b/src/cli/serve.py index 8e26dc5..d87fff2 100644 --- a/src/cli/serve.py +++ b/src/cli/serve.py @@ -15,6 +15,8 @@ from pathlib import Path project_root = Path(__file__).parent.parent.parent sys.path.insert(0, str(project_root)) +from src.config import DEFAULT_DPI + def setup_logging(debug: bool = False) -> None: """Configure logging.""" @@ -65,8 +67,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--dpi", type=int, - default=150, - help="DPI for PDF rendering (must match training DPI)", + default=DEFAULT_DPI, + help=f"DPI for PDF rendering (default: {DEFAULT_DPI}, must match training DPI)", ) parser.add_argument( diff --git a/src/cli/train.py b/src/cli/train.py index a063e48..afb4ba0 100644 --- a/src/cli/train.py +++ b/src/cli/train.py @@ -11,7 +11,7 @@ import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) -from config import PATHS +from src.config import DEFAULT_DPI, PATHS def main(): @@ -103,8 +103,8 @@ def main(): 
parser.add_argument( '--dpi', type=int, - default=150, - help='DPI used for rendering (default: 150, must match autolabel rendering)' + default=DEFAULT_DPI, + help=f'DPI used for rendering (default: {DEFAULT_DPI}, must match autolabel rendering)' ) parser.add_argument( '--export-only', diff --git a/config.py b/src/config.py similarity index 91% rename from config.py rename to src/config.py index e903397..6f183f5 100644 --- a/config.py +++ b/src/config.py @@ -8,9 +8,13 @@ from pathlib import Path from dotenv import load_dotenv # Load environment variables from .env file -env_path = Path(__file__).parent / '.env' +# .env is at project root, config.py is in src/ +env_path = Path(__file__).parent.parent / '.env' load_dotenv(dotenv_path=env_path) +# Global DPI setting - must match training DPI for optimal model performance +DEFAULT_DPI = 150 + def _is_wsl() -> bool: """Check if running inside WSL (Windows Subsystem for Linux).""" @@ -69,7 +73,7 @@ else: # Auto-labeling Configuration AUTOLABEL = { 'workers': 2, - 'dpi': 150, + 'dpi': DEFAULT_DPI, 'min_confidence': 0.5, 'train_ratio': 0.8, 'val_ratio': 0.1, diff --git a/src/data/admin_db.py b/src/data/admin_db.py new file mode 100644 index 0000000..f55a36f --- /dev/null +++ b/src/data/admin_db.py @@ -0,0 +1,1156 @@ +""" +Admin Database Operations + +Database interface for admin document management, annotations, and training tasks. 
+""" + +import logging +from datetime import datetime +from typing import Any +from uuid import UUID + +from sqlalchemy import func +from sqlmodel import select + +from src.data.database import get_session_context +from src.data.admin_models import ( + AdminToken, + AdminDocument, + AdminAnnotation, + TrainingTask, + TrainingLog, + BatchUpload, + BatchUploadFile, + TrainingDocumentLink, + AnnotationHistory, +) + +logger = logging.getLogger(__name__) + + +class AdminDB: + """Database interface for admin operations using SQLModel.""" + + # ========================================================================== + # Admin Token Operations + # ========================================================================== + + def is_valid_admin_token(self, token: str) -> bool: + """Check if admin token exists and is active.""" + with get_session_context() as session: + result = session.get(AdminToken, token) + if result is None: + return False + if not result.is_active: + return False + if result.expires_at and result.expires_at < datetime.utcnow(): + return False + return True + + def get_admin_token(self, token: str) -> AdminToken | None: + """Get admin token details.""" + with get_session_context() as session: + result = session.get(AdminToken, token) + if result: + session.expunge(result) + return result + + def create_admin_token( + self, + token: str, + name: str, + expires_at: datetime | None = None, + ) -> None: + """Create a new admin token.""" + with get_session_context() as session: + existing = session.get(AdminToken, token) + if existing: + existing.name = name + existing.expires_at = expires_at + existing.is_active = True + session.add(existing) + else: + new_token = AdminToken( + token=token, + name=name, + expires_at=expires_at, + ) + session.add(new_token) + + def update_admin_token_usage(self, token: str) -> None: + """Update admin token last used timestamp.""" + with get_session_context() as session: + admin_token = session.get(AdminToken, token) + if 
admin_token: + admin_token.last_used_at = datetime.utcnow() + session.add(admin_token) + + def deactivate_admin_token(self, token: str) -> bool: + """Deactivate an admin token.""" + with get_session_context() as session: + admin_token = session.get(AdminToken, token) + if admin_token: + admin_token.is_active = False + session.add(admin_token) + return True + return False + + # ========================================================================== + # Document Operations + # ========================================================================== + + def create_document( + self, + filename: str, + file_size: int, + content_type: str, + file_path: str, + page_count: int = 1, + upload_source: str = "ui", + csv_field_values: dict[str, Any] | None = None, + admin_token: str | None = None, # Deprecated, kept for compatibility + ) -> str: + """Create a new document record.""" + with get_session_context() as session: + document = AdminDocument( + filename=filename, + file_size=file_size, + content_type=content_type, + file_path=file_path, + page_count=page_count, + upload_source=upload_source, + csv_field_values=csv_field_values, + ) + session.add(document) + session.flush() + return str(document.document_id) + + def get_document(self, document_id: str) -> AdminDocument | None: + """Get a document by ID.""" + with get_session_context() as session: + result = session.get(AdminDocument, UUID(document_id)) + if result: + session.expunge(result) + return result + + def get_document_by_token( + self, + document_id: str, + admin_token: str | None = None, # Deprecated, kept for compatibility + ) -> AdminDocument | None: + """Get a document by ID. 
Token parameter is deprecated.""" + return self.get_document(document_id) + + def get_documents_by_token( + self, + admin_token: str | None = None, # Deprecated, kept for compatibility + status: str | None = None, + upload_source: str | None = None, + has_annotations: bool | None = None, + auto_label_status: str | None = None, + batch_id: str | None = None, + limit: int = 20, + offset: int = 0, + ) -> tuple[list[AdminDocument], int]: + """Get paginated documents with optional filters. Token parameter is deprecated.""" + with get_session_context() as session: + # Base where clause (no token filtering) + where_clauses = [] + + # Apply filters + if status: + where_clauses.append(AdminDocument.status == status) + if upload_source: + where_clauses.append(AdminDocument.upload_source == upload_source) + if auto_label_status: + where_clauses.append(AdminDocument.auto_label_status == auto_label_status) + if batch_id: + where_clauses.append(AdminDocument.batch_id == UUID(batch_id)) + + # Count query + count_stmt = select(func.count()).select_from(AdminDocument) + if where_clauses: + count_stmt = count_stmt.where(*where_clauses) + + # For has_annotations filter, we need to join with annotations + if has_annotations is not None: + from src.data.admin_models import AdminAnnotation + + if has_annotations: + # Documents WITH annotations + count_stmt = ( + count_stmt + .join(AdminAnnotation, AdminAnnotation.document_id == AdminDocument.document_id) + .group_by(AdminDocument.document_id) + ) + else: + # Documents WITHOUT annotations - use left join and filter for null + count_stmt = ( + count_stmt + .outerjoin(AdminAnnotation, AdminAnnotation.document_id == AdminDocument.document_id) + .where(AdminAnnotation.annotation_id.is_(None)) + ) + + total = session.exec(count_stmt).one() + + # Fetch query + statement = select(AdminDocument) + if where_clauses: + statement = statement.where(*where_clauses) + + # Apply has_annotations filter + if has_annotations is not None: + from 
src.data.admin_models import AdminAnnotation + + if has_annotations: + statement = ( + statement + .join(AdminAnnotation, AdminAnnotation.document_id == AdminDocument.document_id) + .group_by(AdminDocument.document_id) + ) + else: + statement = ( + statement + .outerjoin(AdminAnnotation, AdminAnnotation.document_id == AdminDocument.document_id) + .where(AdminAnnotation.annotation_id.is_(None)) + ) + + statement = statement.order_by(AdminDocument.created_at.desc()) + statement = statement.offset(offset).limit(limit) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results), total + + def update_document_status( + self, + document_id: str, + status: str, + auto_label_status: str | None = None, + auto_label_error: str | None = None, + ) -> None: + """Update document status.""" + with get_session_context() as session: + document = session.get(AdminDocument, UUID(document_id)) + if document: + document.status = status + document.updated_at = datetime.utcnow() + if auto_label_status is not None: + document.auto_label_status = auto_label_status + if auto_label_error is not None: + document.auto_label_error = auto_label_error + session.add(document) + + def update_document_file_path(self, document_id: str, file_path: str) -> None: + """Update document file path.""" + with get_session_context() as session: + document = session.get(AdminDocument, UUID(document_id)) + if document: + document.file_path = file_path + document.updated_at = datetime.utcnow() + session.add(document) + + def delete_document(self, document_id: str) -> bool: + """Delete a document and its annotations.""" + with get_session_context() as session: + document = session.get(AdminDocument, UUID(document_id)) + if document: + # Delete annotations first + ann_stmt = select(AdminAnnotation).where( + AdminAnnotation.document_id == UUID(document_id) + ) + annotations = session.exec(ann_stmt).all() + for ann in annotations: + session.delete(ann) + 
session.delete(document) + return True + return False + + # ========================================================================== + # Annotation Operations + # ========================================================================== + + def create_annotation( + self, + document_id: str, + page_number: int, + class_id: int, + class_name: str, + x_center: float, + y_center: float, + width: float, + height: float, + bbox_x: int, + bbox_y: int, + bbox_width: int, + bbox_height: int, + text_value: str | None = None, + confidence: float | None = None, + source: str = "manual", + ) -> str: + """Create a new annotation.""" + with get_session_context() as session: + annotation = AdminAnnotation( + document_id=UUID(document_id), + page_number=page_number, + class_id=class_id, + class_name=class_name, + x_center=x_center, + y_center=y_center, + width=width, + height=height, + bbox_x=bbox_x, + bbox_y=bbox_y, + bbox_width=bbox_width, + bbox_height=bbox_height, + text_value=text_value, + confidence=confidence, + source=source, + ) + session.add(annotation) + session.flush() + return str(annotation.annotation_id) + + def create_annotations_batch( + self, + annotations: list[dict[str, Any]], + ) -> list[str]: + """Create multiple annotations in a batch.""" + with get_session_context() as session: + ids = [] + for ann_data in annotations: + annotation = AdminAnnotation( + document_id=UUID(ann_data["document_id"]), + page_number=ann_data.get("page_number", 1), + class_id=ann_data["class_id"], + class_name=ann_data["class_name"], + x_center=ann_data["x_center"], + y_center=ann_data["y_center"], + width=ann_data["width"], + height=ann_data["height"], + bbox_x=ann_data["bbox_x"], + bbox_y=ann_data["bbox_y"], + bbox_width=ann_data["bbox_width"], + bbox_height=ann_data["bbox_height"], + text_value=ann_data.get("text_value"), + confidence=ann_data.get("confidence"), + source=ann_data.get("source", "auto"), + ) + session.add(annotation) + session.flush() + 
ids.append(str(annotation.annotation_id)) + return ids + + def get_annotation(self, annotation_id: str) -> AdminAnnotation | None: + """Get an annotation by ID.""" + with get_session_context() as session: + result = session.get(AdminAnnotation, UUID(annotation_id)) + if result: + session.expunge(result) + return result + + def get_annotations_for_document( + self, + document_id: str, + page_number: int | None = None, + ) -> list[AdminAnnotation]: + """Get all annotations for a document.""" + with get_session_context() as session: + statement = select(AdminAnnotation).where( + AdminAnnotation.document_id == UUID(document_id) + ) + if page_number is not None: + statement = statement.where(AdminAnnotation.page_number == page_number) + statement = statement.order_by(AdminAnnotation.class_id) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + def update_annotation( + self, + annotation_id: str, + x_center: float | None = None, + y_center: float | None = None, + width: float | None = None, + height: float | None = None, + bbox_x: int | None = None, + bbox_y: int | None = None, + bbox_width: int | None = None, + bbox_height: int | None = None, + text_value: str | None = None, + class_id: int | None = None, + class_name: str | None = None, + ) -> bool: + """Update an annotation.""" + with get_session_context() as session: + annotation = session.get(AdminAnnotation, UUID(annotation_id)) + if annotation: + if x_center is not None: + annotation.x_center = x_center + if y_center is not None: + annotation.y_center = y_center + if width is not None: + annotation.width = width + if height is not None: + annotation.height = height + if bbox_x is not None: + annotation.bbox_x = bbox_x + if bbox_y is not None: + annotation.bbox_y = bbox_y + if bbox_width is not None: + annotation.bbox_width = bbox_width + if bbox_height is not None: + annotation.bbox_height = bbox_height + if text_value is not None: + annotation.text_value 
= text_value + if class_id is not None: + annotation.class_id = class_id + if class_name is not None: + annotation.class_name = class_name + annotation.updated_at = datetime.utcnow() + session.add(annotation) + return True + return False + + def delete_annotation(self, annotation_id: str) -> bool: + """Delete an annotation.""" + with get_session_context() as session: + annotation = session.get(AdminAnnotation, UUID(annotation_id)) + if annotation: + session.delete(annotation) + return True + return False + + def delete_annotations_for_document( + self, + document_id: str, + source: str | None = None, + ) -> int: + """Delete all annotations for a document. Returns count deleted.""" + with get_session_context() as session: + statement = select(AdminAnnotation).where( + AdminAnnotation.document_id == UUID(document_id) + ) + if source: + statement = statement.where(AdminAnnotation.source == source) + annotations = session.exec(statement).all() + count = len(annotations) + for ann in annotations: + session.delete(ann) + return count + + # ========================================================================== + # Training Task Operations + # ========================================================================== + + def create_training_task( + self, + admin_token: str, + name: str, + task_type: str = "train", + description: str | None = None, + config: dict[str, Any] | None = None, + scheduled_at: datetime | None = None, + cron_expression: str | None = None, + is_recurring: bool = False, + ) -> str: + """Create a new training task.""" + with get_session_context() as session: + task = TrainingTask( + admin_token=admin_token, + name=name, + task_type=task_type, + description=description, + config=config, + scheduled_at=scheduled_at, + cron_expression=cron_expression, + is_recurring=is_recurring, + status="scheduled" if scheduled_at else "pending", + ) + session.add(task) + session.flush() + return str(task.task_id) + + def get_training_task(self, task_id: str) -> 
TrainingTask | None: + """Get a training task by ID.""" + with get_session_context() as session: + result = session.get(TrainingTask, UUID(task_id)) + if result: + session.expunge(result) + return result + + def get_training_task_by_token( + self, + task_id: str, + admin_token: str | None = None, # Deprecated, kept for compatibility + ) -> TrainingTask | None: + """Get a training task by ID. Token parameter is deprecated.""" + return self.get_training_task(task_id) + + def get_training_tasks_by_token( + self, + admin_token: str | None = None, # Deprecated, kept for compatibility + status: str | None = None, + limit: int = 20, + offset: int = 0, + ) -> tuple[list[TrainingTask], int]: + """Get paginated training tasks. Token parameter is deprecated.""" + with get_session_context() as session: + # Count query (no token filtering) + count_stmt = select(func.count()).select_from(TrainingTask) + if status: + count_stmt = count_stmt.where(TrainingTask.status == status) + total = session.exec(count_stmt).one() + + # Fetch query (no token filtering) + statement = select(TrainingTask) + if status: + statement = statement.where(TrainingTask.status == status) + statement = statement.order_by(TrainingTask.created_at.desc()) + statement = statement.offset(offset).limit(limit) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results), total + + def get_pending_training_tasks(self) -> list[TrainingTask]: + """Get pending training tasks ready to run.""" + with get_session_context() as session: + now = datetime.utcnow() + statement = select(TrainingTask).where( + TrainingTask.status.in_(["pending", "scheduled"]), + (TrainingTask.scheduled_at == None) | (TrainingTask.scheduled_at <= now), + ).order_by(TrainingTask.created_at) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + def update_training_task_status( + self, + task_id: str, + status: str, + error_message: str | None 
= None, + result_metrics: dict[str, Any] | None = None, + model_path: str | None = None, + ) -> None: + """Update training task status.""" + with get_session_context() as session: + task = session.get(TrainingTask, UUID(task_id)) + if task: + task.status = status + task.updated_at = datetime.utcnow() + if status == "running": + task.started_at = datetime.utcnow() + elif status in ("completed", "failed"): + task.completed_at = datetime.utcnow() + if error_message is not None: + task.error_message = error_message + if result_metrics is not None: + task.result_metrics = result_metrics + if model_path is not None: + task.model_path = model_path + session.add(task) + + def cancel_training_task(self, task_id: str) -> bool: + """Cancel a training task.""" + with get_session_context() as session: + task = session.get(TrainingTask, UUID(task_id)) + if task and task.status in ("pending", "scheduled"): + task.status = "cancelled" + task.updated_at = datetime.utcnow() + session.add(task) + return True + return False + + # ========================================================================== + # Training Log Operations + # ========================================================================== + + def add_training_log( + self, + task_id: str, + level: str, + message: str, + details: dict[str, Any] | None = None, + ) -> None: + """Add a training log entry.""" + with get_session_context() as session: + log = TrainingLog( + task_id=UUID(task_id), + level=level, + message=message, + details=details, + ) + session.add(log) + + def get_training_logs( + self, + task_id: str, + limit: int = 100, + offset: int = 0, + ) -> list[TrainingLog]: + """Get training logs for a task.""" + with get_session_context() as session: + statement = select(TrainingLog).where( + TrainingLog.task_id == UUID(task_id) + ).order_by(TrainingLog.created_at.desc()).offset(offset).limit(limit) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + # 
========================================================================== + # Export Operations + # ========================================================================== + + def get_labeled_documents_for_export( + self, + admin_token: str | None = None, + ) -> list[AdminDocument]: + """Get all labeled documents ready for export.""" + with get_session_context() as session: + statement = select(AdminDocument).where( + AdminDocument.status == "labeled" + ) + if admin_token: + statement = statement.where(AdminDocument.admin_token == admin_token) + statement = statement.order_by(AdminDocument.created_at) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + def count_documents_by_status( + self, + admin_token: str | None = None, # Deprecated, kept for compatibility + ) -> dict[str, int]: + """Count documents by status. Token parameter is deprecated.""" + with get_session_context() as session: + statement = select( + AdminDocument.status, + func.count(AdminDocument.document_id), + ).group_by(AdminDocument.status) + # No longer filter by token + + results = session.exec(statement).all() + return {status: count for status, count in results} + + # ========================================================================== + # Batch Upload Operations (v2) + # ========================================================================== + + def create_batch_upload( + self, + admin_token: str, + filename: str, + file_size: int, + upload_source: str = "ui", + ) -> BatchUpload: + """Create a new batch upload record.""" + with get_session_context() as session: + batch = BatchUpload( + admin_token=admin_token, + filename=filename, + file_size=file_size, + upload_source=upload_source, + ) + session.add(batch) + session.commit() + session.refresh(batch) + session.expunge(batch) + return batch + + def get_batch_upload(self, batch_id: UUID) -> BatchUpload | None: + """Get batch upload by ID.""" + with get_session_context() as 
session: + result = session.get(BatchUpload, batch_id) + if result: + session.expunge(result) + return result + + def update_batch_upload( + self, + batch_id: UUID, + **kwargs: Any, + ) -> None: + """Update batch upload fields.""" + with get_session_context() as session: + batch = session.get(BatchUpload, batch_id) + if batch: + for key, value in kwargs.items(): + if hasattr(batch, key): + setattr(batch, key, value) + session.add(batch) + + def create_batch_upload_file( + self, + batch_id: UUID, + filename: str, + **kwargs: Any, + ) -> BatchUploadFile: + """Create a batch upload file record.""" + with get_session_context() as session: + file_record = BatchUploadFile( + batch_id=batch_id, + filename=filename, + **kwargs, + ) + session.add(file_record) + session.commit() + session.refresh(file_record) + session.expunge(file_record) + return file_record + + def update_batch_upload_file( + self, + file_id: UUID, + **kwargs: Any, + ) -> None: + """Update batch upload file fields.""" + with get_session_context() as session: + file_record = session.get(BatchUploadFile, file_id) + if file_record: + for key, value in kwargs.items(): + if hasattr(file_record, key): + setattr(file_record, key, value) + session.add(file_record) + + def get_batch_upload_files( + self, + batch_id: UUID, + ) -> list[BatchUploadFile]: + """Get all files for a batch upload.""" + with get_session_context() as session: + statement = select(BatchUploadFile).where( + BatchUploadFile.batch_id == batch_id + ).order_by(BatchUploadFile.created_at) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + def get_batch_uploads_by_token( + self, + admin_token: str | None = None, # Deprecated, kept for compatibility + limit: int = 50, + offset: int = 0, + ) -> tuple[list[BatchUpload], int]: + """Get paginated batch uploads. 
Token parameter is deprecated.""" + with get_session_context() as session: + # Count query (no token filtering) + count_stmt = select(func.count()).select_from(BatchUpload) + total = session.exec(count_stmt).one() + + # Fetch query (no token filtering) + statement = select(BatchUpload).order_by( + BatchUpload.created_at.desc() + ).offset(offset).limit(limit) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results), total + + # ========================================================================== + # Training Document Link Operations (v2) + # ========================================================================== + + def create_training_document_link( + self, + task_id: UUID, + document_id: UUID, + annotation_snapshot: dict[str, Any] | None = None, + ) -> TrainingDocumentLink: + """Create a training document link.""" + with get_session_context() as session: + link = TrainingDocumentLink( + task_id=task_id, + document_id=document_id, + annotation_snapshot=annotation_snapshot, + ) + session.add(link) + session.commit() + session.refresh(link) + session.expunge(link) + return link + + def get_training_document_links( + self, + task_id: UUID, + ) -> list[TrainingDocumentLink]: + """Get all document links for a training task.""" + with get_session_context() as session: + statement = select(TrainingDocumentLink).where( + TrainingDocumentLink.task_id == task_id + ).order_by(TrainingDocumentLink.created_at) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + def get_document_training_tasks( + self, + document_id: UUID, + ) -> list[TrainingDocumentLink]: + """Get all training tasks that used this document.""" + with get_session_context() as session: + statement = select(TrainingDocumentLink).where( + TrainingDocumentLink.document_id == document_id + ).order_by(TrainingDocumentLink.created_at.desc()) + + results = session.exec(statement).all() + for r in 
results: + session.expunge(r) + return list(results) + + # ========================================================================== + # Annotation History Operations (v2) + # ========================================================================== + + def create_annotation_history( + self, + annotation_id: UUID, + document_id: UUID, + action: str, + previous_value: dict[str, Any] | None = None, + new_value: dict[str, Any] | None = None, + changed_by: str | None = None, + change_reason: str | None = None, + ) -> AnnotationHistory: + """Create an annotation history record.""" + with get_session_context() as session: + history = AnnotationHistory( + annotation_id=annotation_id, + document_id=document_id, + action=action, + previous_value=previous_value, + new_value=new_value, + changed_by=changed_by, + change_reason=change_reason, + ) + session.add(history) + session.commit() + session.refresh(history) + session.expunge(history) + return history + + def get_annotation_history( + self, + annotation_id: UUID, + ) -> list[AnnotationHistory]: + """Get history for a specific annotation.""" + with get_session_context() as session: + statement = select(AnnotationHistory).where( + AnnotationHistory.annotation_id == annotation_id + ).order_by(AnnotationHistory.created_at.desc()) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + def get_document_annotation_history( + self, + document_id: UUID, + ) -> list[AnnotationHistory]: + """Get all annotation history for a document.""" + with get_session_context() as session: + statement = select(AnnotationHistory).where( + AnnotationHistory.document_id == document_id + ).order_by(AnnotationHistory.created_at.desc()) + + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + # ========================================================================= + # Annotation Lock Methods + # 
========================================================================= + + def acquire_annotation_lock( + self, + document_id: str, + admin_token: str | None = None, # Deprecated, kept for compatibility + duration_seconds: int = 300, + ) -> AdminDocument | None: + """Acquire annotation lock for a document. + + Returns the updated document if lock was acquired, None if failed. + """ + from datetime import datetime, timedelta, timezone + + with get_session_context() as session: + # Get document + doc = session.get(AdminDocument, UUID(document_id)) + if not doc: + return None + + # Check if already locked by someone else + now = datetime.now(timezone.utc) + if doc.annotation_lock_until and doc.annotation_lock_until > now: + # Document is already locked + return None + + # Acquire lock + doc.annotation_lock_until = now + timedelta(seconds=duration_seconds) + session.add(doc) + session.commit() + session.refresh(doc) + session.expunge(doc) + return doc + + def release_annotation_lock( + self, + document_id: str, + admin_token: str | None = None, # Deprecated, kept for compatibility + force: bool = False, + ) -> AdminDocument | None: + """Release annotation lock for a document. + + Args: + document_id: Document UUID + admin_token: Deprecated, kept for compatibility + force: If True, release lock even if expired (admin override) + + Returns the updated document if lock was released, None if failed. + """ + with get_session_context() as session: + # Get document + doc = session.get(AdminDocument, UUID(document_id)) + if not doc: + return None + + # Release lock + doc.annotation_lock_until = None + session.add(doc) + session.commit() + session.refresh(doc) + session.expunge(doc) + return doc + + def extend_annotation_lock( + self, + document_id: str, + admin_token: str | None = None, # Deprecated, kept for compatibility + additional_seconds: int = 300, + ) -> AdminDocument | None: + """Extend an existing annotation lock. 
+ + Returns the updated document if lock was extended, None if failed. + """ + from datetime import datetime, timedelta, timezone + + with get_session_context() as session: + # Get document + doc = session.get(AdminDocument, UUID(document_id)) + if not doc: + return None + + # Check if lock exists and is still valid + now = datetime.now(timezone.utc) + if not doc.annotation_lock_until or doc.annotation_lock_until <= now: + # Lock doesn't exist or has expired + return None + + # Extend lock + doc.annotation_lock_until = doc.annotation_lock_until + timedelta(seconds=additional_seconds) + session.add(doc) + session.commit() + session.refresh(doc) + session.expunge(doc) + return doc + + # ========================================================================== + # Phase 4 & 5: Training Data Management and Annotation Enhancement + # ========================================================================== + + def get_documents_for_training( + self, + admin_token: str | None = None, # Deprecated, kept for compatibility + status: str = "labeled", + has_annotations: bool = True, + min_annotation_count: int | None = None, + exclude_used_in_training: bool = False, + limit: int = 100, + offset: int = 0, + ) -> tuple[list[AdminDocument], int]: + """Get documents suitable for training with filtering. 
+ + Args: + admin_token: Deprecated, kept for compatibility + status: Document status filter (default: labeled) + has_annotations: Only include documents with annotations + min_annotation_count: Minimum annotation count filter + exclude_used_in_training: Exclude documents already used in training + limit: Page size + offset: Pagination offset + + Returns: + Tuple of (documents, total_count) + """ + with get_session_context() as session: + # Base query (no token filtering) + statement = select(AdminDocument).where( + AdminDocument.status == status, + ) + + # Filter by annotations if needed + if has_annotations or min_annotation_count: + # Join with annotations to filter + from sqlalchemy import exists + annotation_subq = ( + select(func.count(AdminAnnotation.annotation_id)) + .where(AdminAnnotation.document_id == AdminDocument.document_id) + .correlate(AdminDocument) + .scalar_subquery() + ) + + if has_annotations: + statement = statement.where(annotation_subq > 0) + + if min_annotation_count: + statement = statement.where(annotation_subq >= min_annotation_count) + + # Exclude documents used in training if requested + if exclude_used_in_training: + from sqlalchemy import exists + training_subq = exists( + select(1) + .select_from(TrainingDocumentLink) + .where(TrainingDocumentLink.document_id == AdminDocument.document_id) + ) + statement = statement.where(~training_subq) + + # Get total count + count_statement = select(func.count()).select_from(statement.subquery()) + total = session.exec(count_statement).one() + + # Apply pagination + statement = statement.order_by(AdminDocument.created_at.desc()) + statement = statement.limit(limit).offset(offset) + + # Execute query + results = session.exec(statement).all() + for r in results: + session.expunge(r) + + return list(results), total + + def verify_annotation( + self, + annotation_id: str, + admin_token: str, + ) -> AdminAnnotation | None: + """Mark an annotation as verified. 
def override_annotation(
    self,
    annotation_id: str,
    admin_token: str,
    change_reason: str | None = None,
    **updates: Any,
) -> AdminAnnotation | None:
    """Override an auto-generated annotation.

    Applies ``updates`` to the annotation, records an ``AnnotationHistory``
    row with action ``"override"`` holding the previous and new values, and,
    if the annotation's source was ``"auto"`` before this call, marks it as
    manually overridden (``override_source="auto"``, ``source="manual"``).

    Args:
        annotation_id: Annotation UUID (string form).
        admin_token: Admin token, stored as ``changed_by`` on the history row.
        change_reason: Optional reason for override.
        **updates: Fields to update (bbox, text_value, etc.). Keys that are
            not attributes of the annotation are ignored.

    Returns:
        Updated annotation (expunged from the session) or None if not found.

    Raises:
        ValueError: If ``annotation_id`` is not a valid UUID string.
    """
    with get_session_context() as session:
        annotation = session.get(AdminAnnotation, UUID(annotation_id))
        if not annotation:
            return None

        # Save previous state for the history record
        previous_value = {
            "class_id": annotation.class_id,
            "class_name": annotation.class_name,
            "bbox": {
                "x": annotation.bbox_x,
                "y": annotation.bbox_y,
                "width": annotation.bbox_width,
                "height": annotation.bbox_height,
            },
            "normalized": {
                "x_center": annotation.x_center,
                "y_center": annotation.y_center,
                "width": annotation.width,
                "height": annotation.height,
            },
            "text_value": annotation.text_value,
            "confidence": annotation.confidence,
            "source": annotation.source,
        }

        # Apply updates (unknown keys are skipped)
        for key, value in updates.items():
            if hasattr(annotation, key):
                setattr(annotation, key, value)

        # Mark as overridden if it WAS auto-generated. The decision uses the
        # source captured before the updates were applied, so an update that
        # itself touches ``source`` cannot hide the fact that an auto
        # annotation was overridden.
        if previous_value["source"] == "auto":
            annotation.override_source = "auto"
            annotation.source = "manual"

        annotation.updated_at = datetime.utcnow()
        session.add(annotation)

        # Create history record
        history = AnnotationHistory(
            annotation_id=UUID(annotation_id),
            document_id=annotation.document_id,
            action="override",
            previous_value=previous_value,
            new_value=updates,
            changed_by=admin_token,
            change_reason=change_reason,
        )
        session.add(history)

        session.commit()
        # Reload committed state, then detach so the object stays usable
        # after the session closes
        session.refresh(annotation)
        session.expunge(annotation)
        return annotation
+""" + +from datetime import datetime +from typing import Any +from uuid import UUID, uuid4 + +from sqlmodel import Field, SQLModel, Column, JSON + + +# ============================================================================= +# CSV to Field Class Mapping +# ============================================================================= + +CSV_TO_CLASS_MAPPING: dict[str, int] = { + "InvoiceNumber": 0, # invoice_number + "InvoiceDate": 1, # invoice_date + "InvoiceDueDate": 2, # invoice_due_date + "OCR": 3, # ocr_number + "Bankgiro": 4, # bankgiro + "Plusgiro": 5, # plusgiro + "Amount": 6, # amount + "supplier_organisation_number": 7, # supplier_organisation_number + # 8: payment_line (derived from OCR/Bankgiro/Amount) + "customer_number": 9, # customer_number +} + + +# ============================================================================= +# Core Models +# ============================================================================= + + +class AdminToken(SQLModel, table=True): + """Admin authentication token.""" + + __tablename__ = "admin_tokens" + + token: str = Field(primary_key=True, max_length=255) + name: str = Field(max_length=255) + is_active: bool = Field(default=True) + created_at: datetime = Field(default_factory=datetime.utcnow) + last_used_at: datetime | None = Field(default=None) + expires_at: datetime | None = Field(default=None) + + +class AdminDocument(SQLModel, table=True): + """Document uploaded for labeling/annotation.""" + + __tablename__ = "admin_documents" + + document_id: UUID = Field(default_factory=uuid4, primary_key=True) + admin_token: str | None = Field(default=None, foreign_key="admin_tokens.token", max_length=255, index=True) + filename: str = Field(max_length=255) + file_size: int + content_type: str = Field(max_length=100) + file_path: str = Field(max_length=512) # Path to stored file + page_count: int = Field(default=1) + status: str = Field(default="pending", max_length=20, index=True) + # Status: pending, auto_labeling, 
labeled, exported + auto_label_status: str | None = Field(default=None, max_length=20) + # Auto-label status: running, completed, failed + auto_label_error: str | None = Field(default=None) + # v2: Upload source tracking + upload_source: str = Field(default="ui", max_length=20) + # Upload source: ui, api + batch_id: UUID | None = Field(default=None, index=True) + # Link to batch upload (if uploaded via ZIP) + csv_field_values: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + # Original CSV values for reference + auto_label_queued_at: datetime | None = Field(default=None) + # When auto-label was queued + annotation_lock_until: datetime | None = Field(default=None) + # Lock for manual annotation while auto-label runs + created_at: datetime = Field(default_factory=datetime.utcnow) + updated_at: datetime = Field(default_factory=datetime.utcnow) + + +class AdminAnnotation(SQLModel, table=True): + """Annotation for a document (bounding box + label).""" + + __tablename__ = "admin_annotations" + + annotation_id: UUID = Field(default_factory=uuid4, primary_key=True) + document_id: UUID = Field(foreign_key="admin_documents.document_id", index=True) + page_number: int = Field(default=1) # 1-indexed + class_id: int # 0-9 for invoice fields + class_name: str = Field(max_length=50) # e.g., "invoice_number" + # Bounding box (normalized 0-1 coordinates) + x_center: float + y_center: float + width: float + height: float + # Original pixel coordinates (for display) + bbox_x: int + bbox_y: int + bbox_width: int + bbox_height: int + # OCR extracted text (if available) + text_value: str | None = Field(default=None) + confidence: float | None = Field(default=None) + # Source: manual, auto, imported + source: str = Field(default="manual", max_length=20, index=True) + # v2: Verification fields + is_verified: bool = Field(default=False, index=True) + verified_at: datetime | None = Field(default=None) + verified_by: str | None = Field(default=None, max_length=255) + # 
v2: Override tracking + override_source: str | None = Field(default=None, max_length=20) + # If this annotation overrides another: 'auto' or 'imported' + original_annotation_id: UUID | None = Field(default=None) + # Reference to the annotation this overrides + created_at: datetime = Field(default_factory=datetime.utcnow) + updated_at: datetime = Field(default_factory=datetime.utcnow) + + +class TrainingTask(SQLModel, table=True): + """Training/fine-tuning task.""" + + __tablename__ = "training_tasks" + + task_id: UUID = Field(default_factory=uuid4, primary_key=True) + admin_token: str = Field(foreign_key="admin_tokens.token", max_length=255, index=True) + name: str = Field(max_length=255) + description: str | None = Field(default=None) + status: str = Field(default="pending", max_length=20, index=True) + # Status: pending, scheduled, running, completed, failed, cancelled + task_type: str = Field(default="train", max_length=20) + # Task type: train, finetune + # Training configuration + config: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + # Schedule settings + scheduled_at: datetime | None = Field(default=None) + cron_expression: str | None = Field(default=None, max_length=50) + is_recurring: bool = Field(default=False) + # Execution details + started_at: datetime | None = Field(default=None) + completed_at: datetime | None = Field(default=None) + error_message: str | None = Field(default=None) + # Result metrics + result_metrics: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + model_path: str | None = Field(default=None, max_length=512) + # v2: Document count and extracted metrics + document_count: int = Field(default=0) + # Count of documents used in training + metrics_mAP: float | None = Field(default=None, index=True) + metrics_precision: float | None = Field(default=None) + metrics_recall: float | None = Field(default=None) + # Extracted metrics for easy querying + created_at: datetime = 
Field(default_factory=datetime.utcnow) + updated_at: datetime = Field(default_factory=datetime.utcnow) + + +class TrainingLog(SQLModel, table=True): + """Training log entry.""" + + __tablename__ = "training_logs" + + log_id: int | None = Field(default=None, primary_key=True) + task_id: UUID = Field(foreign_key="training_tasks.task_id", index=True) + level: str = Field(max_length=20) # INFO, WARNING, ERROR + message: str + details: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + created_at: datetime = Field(default_factory=datetime.utcnow, index=True) + + +# ============================================================================= +# Batch Upload Models (v2) +# ============================================================================= + + +class BatchUpload(SQLModel, table=True): + """Batch upload of multiple documents via ZIP file.""" + + __tablename__ = "batch_uploads" + + batch_id: UUID = Field(default_factory=uuid4, primary_key=True) + admin_token: str = Field(foreign_key="admin_tokens.token", max_length=255, index=True) + filename: str = Field(max_length=255) # ZIP filename + file_size: int + upload_source: str = Field(default="ui", max_length=20) + # Upload source: ui, api + status: str = Field(default="processing", max_length=20, index=True) + # Status: processing, completed, partial, failed + total_files: int = Field(default=0) + processed_files: int = Field(default=0) + # Number of files processed so far + successful_files: int = Field(default=0) + failed_files: int = Field(default=0) + csv_filename: str | None = Field(default=None, max_length=255) + # CSV file used for auto-labeling + csv_row_count: int | None = Field(default=None) + error_message: str | None = Field(default=None) + created_at: datetime = Field(default_factory=datetime.utcnow) + completed_at: datetime | None = Field(default=None) + + +class BatchUploadFile(SQLModel, table=True): + """Individual file within a batch upload.""" + + __tablename__ = 
"batch_upload_files" + + file_id: UUID = Field(default_factory=uuid4, primary_key=True) + batch_id: UUID = Field(foreign_key="batch_uploads.batch_id", index=True) + filename: str = Field(max_length=255) # PDF filename within ZIP + document_id: UUID | None = Field(default=None) + # Link to created AdminDocument (if successful) + status: str = Field(default="pending", max_length=20, index=True) + # Status: pending, processing, completed, failed, skipped + error_message: str | None = Field(default=None) + annotation_count: int = Field(default=0) + # Number of annotations created for this file + csv_row_data: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + # CSV row data for this file (if available) + created_at: datetime = Field(default_factory=datetime.utcnow) + processed_at: datetime | None = Field(default=None) + + +# ============================================================================= +# Training Document Link (v2) +# ============================================================================= + + +class TrainingDocumentLink(SQLModel, table=True): + """Junction table linking training tasks to documents.""" + + __tablename__ = "training_document_links" + + link_id: UUID = Field(default_factory=uuid4, primary_key=True) + task_id: UUID = Field(foreign_key="training_tasks.task_id", index=True) + document_id: UUID = Field(foreign_key="admin_documents.document_id", index=True) + annotation_snapshot: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + # Snapshot of annotations at training time (includes count, verified count, etc.) 
+ created_at: datetime = Field(default_factory=datetime.utcnow) + + +# ============================================================================= +# Annotation History (v2) +# ============================================================================= + + +class AnnotationHistory(SQLModel, table=True): + """History of annotation changes (for override tracking).""" + + __tablename__ = "annotation_history" + + history_id: UUID = Field(default_factory=uuid4, primary_key=True) + annotation_id: UUID = Field(foreign_key="admin_annotations.annotation_id", index=True) + document_id: UUID = Field(foreign_key="admin_documents.document_id", index=True) + # Change action: created, updated, deleted, override + action: str = Field(max_length=20, index=True) + # Previous value (for updates/deletes) + previous_value: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + # New value (for creates/updates) + new_value: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON)) + # Change metadata + changed_by: str | None = Field(default=None, max_length=255) + # User/token who made the change + change_reason: str | None = Field(default=None) + # Optional reason for change + created_at: datetime = Field(default_factory=datetime.utcnow, index=True) + + +# Field class mapping (same as src/cli/train.py) +FIELD_CLASSES = { + 0: "invoice_number", + 1: "invoice_date", + 2: "invoice_due_date", + 3: "ocr_number", + 4: "bankgiro", + 5: "plusgiro", + 6: "amount", + 7: "supplier_organisation_number", + 8: "payment_line", + 9: "customer_number", +} + +FIELD_CLASS_IDS = {v: k for k, v in FIELD_CLASSES.items()} + + +# Read-only models for API responses +class AdminDocumentRead(SQLModel): + """Admin document response model.""" + + document_id: UUID + filename: str + file_size: int + content_type: str + page_count: int + status: str + auto_label_status: str | None + auto_label_error: str | None + created_at: datetime + updated_at: datetime + + +class 
AdminAnnotationRead(SQLModel): + """Admin annotation response model.""" + + annotation_id: UUID + document_id: UUID + page_number: int + class_id: int + class_name: str + x_center: float + y_center: float + width: float + height: float + bbox_x: int + bbox_y: int + bbox_width: int + bbox_height: int + text_value: str | None + confidence: float | None + source: str + created_at: datetime + + +class TrainingTaskRead(SQLModel): + """Training task response model.""" + + task_id: UUID + name: str + description: str | None + status: str + task_type: str + config: dict[str, Any] | None + scheduled_at: datetime | None + is_recurring: bool + started_at: datetime | None + completed_at: datetime | None + error_message: str | None + result_metrics: dict[str, Any] | None + model_path: str | None + created_at: datetime diff --git a/src/data/async_request_db.py b/src/data/async_request_db.py new file mode 100644 index 0000000..d3853f5 --- /dev/null +++ b/src/data/async_request_db.py @@ -0,0 +1,374 @@ +""" +Async Request Database Operations + +Database interface for async invoice processing requests using SQLModel. 
+""" + +import logging +from datetime import datetime, timedelta +from typing import Any +from uuid import UUID + +from sqlalchemy import func, text +from sqlmodel import Session, select + +from src.data.database import get_session_context, create_db_and_tables, close_engine +from src.data.models import ApiKey, AsyncRequest, RateLimitEvent + +logger = logging.getLogger(__name__) + + +# Legacy dataclasses for backward compatibility +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ApiKeyConfig: + """API key configuration and limits (legacy compatibility).""" + + api_key: str + name: str + is_active: bool + requests_per_minute: int + max_concurrent_jobs: int + max_file_size_mb: int + + +class AsyncRequestDB: + """Database interface for async processing requests using SQLModel.""" + + def __init__(self, connection_string: str | None = None) -> None: + # connection_string is kept for backward compatibility but ignored + # SQLModel uses the global engine from database.py + self._initialized = False + + def connect(self): + """Legacy method - returns self for compatibility.""" + return self + + def close(self) -> None: + """Close database connections.""" + close_engine() + + def __enter__(self) -> "AsyncRequestDB": + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + pass # Sessions are managed per-operation + + def create_tables(self) -> None: + """Create async processing tables if they don't exist.""" + create_db_and_tables() + self._initialized = True + + # ========================================================================== + # API Key Operations + # ========================================================================== + + def is_valid_api_key(self, api_key: str) -> bool: + """Check if API key exists and is active.""" + with get_session_context() as session: + result = session.get(ApiKey, api_key) + return result is not None and result.is_active is True + + def get_api_key_config(self, api_key: str) 
-> ApiKeyConfig | None: + """Get API key configuration and limits.""" + with get_session_context() as session: + result = session.get(ApiKey, api_key) + if result is None: + return None + return ApiKeyConfig( + api_key=result.api_key, + name=result.name, + is_active=result.is_active, + requests_per_minute=result.requests_per_minute, + max_concurrent_jobs=result.max_concurrent_jobs, + max_file_size_mb=result.max_file_size_mb, + ) + + def create_api_key( + self, + api_key: str, + name: str, + requests_per_minute: int = 10, + max_concurrent_jobs: int = 3, + max_file_size_mb: int = 50, + ) -> None: + """Create a new API key.""" + with get_session_context() as session: + existing = session.get(ApiKey, api_key) + if existing: + existing.name = name + existing.requests_per_minute = requests_per_minute + existing.max_concurrent_jobs = max_concurrent_jobs + existing.max_file_size_mb = max_file_size_mb + session.add(existing) + else: + new_key = ApiKey( + api_key=api_key, + name=name, + requests_per_minute=requests_per_minute, + max_concurrent_jobs=max_concurrent_jobs, + max_file_size_mb=max_file_size_mb, + ) + session.add(new_key) + + def update_api_key_usage(self, api_key: str) -> None: + """Update API key last used timestamp and increment total requests.""" + with get_session_context() as session: + key = session.get(ApiKey, api_key) + if key: + key.last_used_at = datetime.utcnow() + key.total_requests += 1 + session.add(key) + + # ========================================================================== + # Async Request Operations + # ========================================================================== + + def create_request( + self, + api_key: str, + filename: str, + file_size: int, + content_type: str, + expires_at: datetime, + request_id: str | None = None, + ) -> str: + """Create a new async request.""" + with get_session_context() as session: + request = AsyncRequest( + api_key=api_key, + filename=filename, + file_size=file_size, + content_type=content_type, 
+ expires_at=expires_at, + ) + if request_id: + request.request_id = UUID(request_id) + session.add(request) + session.flush() # To get the generated ID + return str(request.request_id) + + def get_request(self, request_id: str) -> AsyncRequest | None: + """Get a single async request by ID.""" + with get_session_context() as session: + result = session.get(AsyncRequest, UUID(request_id)) + if result: + # Detach from session for use outside context + session.expunge(result) + return result + + def get_request_by_api_key( + self, + request_id: str, + api_key: str, + ) -> AsyncRequest | None: + """Get a request only if it belongs to the given API key.""" + with get_session_context() as session: + statement = select(AsyncRequest).where( + AsyncRequest.request_id == UUID(request_id), + AsyncRequest.api_key == api_key, + ) + result = session.exec(statement).first() + if result: + session.expunge(result) + return result + + def update_status( + self, + request_id: str, + status: str, + error_message: str | None = None, + increment_retry: bool = False, + ) -> None: + """Update request status.""" + with get_session_context() as session: + request = session.get(AsyncRequest, UUID(request_id)) + if request: + request.status = status + if status == "processing": + request.started_at = datetime.utcnow() + if error_message is not None: + request.error_message = error_message + if increment_retry: + request.retry_count += 1 + session.add(request) + + def complete_request( + self, + request_id: str, + document_id: str, + result: dict[str, Any], + processing_time_ms: float, + visualization_path: str | None = None, + ) -> None: + """Mark request as completed with result.""" + with get_session_context() as session: + request = session.get(AsyncRequest, UUID(request_id)) + if request: + request.status = "completed" + request.document_id = document_id + request.result = result + request.processing_time_ms = processing_time_ms + request.visualization_path = visualization_path + 
request.completed_at = datetime.utcnow() + session.add(request) + + def get_requests_by_api_key( + self, + api_key: str, + status: str | None = None, + limit: int = 20, + offset: int = 0, + ) -> tuple[list[AsyncRequest], int]: + """Get paginated requests for an API key.""" + with get_session_context() as session: + # Count query + count_stmt = select(func.count()).select_from(AsyncRequest).where( + AsyncRequest.api_key == api_key + ) + if status: + count_stmt = count_stmt.where(AsyncRequest.status == status) + total = session.exec(count_stmt).one() + + # Fetch query + statement = select(AsyncRequest).where( + AsyncRequest.api_key == api_key + ) + if status: + statement = statement.where(AsyncRequest.status == status) + statement = statement.order_by(AsyncRequest.created_at.desc()) + statement = statement.offset(offset).limit(limit) + + results = session.exec(statement).all() + # Detach results from session + for r in results: + session.expunge(r) + return list(results), total + + def count_active_jobs(self, api_key: str) -> int: + """Count active (pending + processing) jobs for an API key.""" + with get_session_context() as session: + statement = select(func.count()).select_from(AsyncRequest).where( + AsyncRequest.api_key == api_key, + AsyncRequest.status.in_(["pending", "processing"]), + ) + return session.exec(statement).one() + + def get_pending_requests(self, limit: int = 10) -> list[AsyncRequest]: + """Get pending requests ordered by creation time.""" + with get_session_context() as session: + statement = select(AsyncRequest).where( + AsyncRequest.status == "pending" + ).order_by(AsyncRequest.created_at).limit(limit) + results = session.exec(statement).all() + for r in results: + session.expunge(r) + return list(results) + + def get_queue_position(self, request_id: str) -> int | None: + """Get position of a request in the pending queue.""" + with get_session_context() as session: + # Get the request's created_at + request = session.get(AsyncRequest, 
UUID(request_id)) + if not request: + return None + + # Count pending requests created before this one + statement = select(func.count()).select_from(AsyncRequest).where( + AsyncRequest.status == "pending", + AsyncRequest.created_at < request.created_at, + ) + count = session.exec(statement).one() + return count + 1 # 1-based position + + # ========================================================================== + # Rate Limit Operations + # ========================================================================== + + def record_rate_limit_event(self, api_key: str, event_type: str) -> None: + """Record a rate limit event.""" + with get_session_context() as session: + event = RateLimitEvent( + api_key=api_key, + event_type=event_type, + ) + session.add(event) + + def count_recent_requests(self, api_key: str, seconds: int = 60) -> int: + """Count requests in the last N seconds.""" + with get_session_context() as session: + cutoff = datetime.utcnow() - timedelta(seconds=seconds) + statement = select(func.count()).select_from(RateLimitEvent).where( + RateLimitEvent.api_key == api_key, + RateLimitEvent.event_type == "request", + RateLimitEvent.created_at > cutoff, + ) + return session.exec(statement).one() + + # ========================================================================== + # Cleanup Operations + # ========================================================================== + + def delete_expired_requests(self) -> int: + """Delete requests that have expired. 
Returns count of deleted rows.""" + with get_session_context() as session: + now = datetime.utcnow() + statement = select(AsyncRequest).where(AsyncRequest.expires_at < now) + expired = session.exec(statement).all() + count = len(expired) + for request in expired: + session.delete(request) + logger.info(f"Deleted {count} expired async requests") + return count + + def cleanup_old_rate_limit_events(self, hours: int = 1) -> int: + """Delete rate limit events older than N hours.""" + with get_session_context() as session: + cutoff = datetime.utcnow() - timedelta(hours=hours) + statement = select(RateLimitEvent).where( + RateLimitEvent.created_at < cutoff + ) + old_events = session.exec(statement).all() + count = len(old_events) + for event in old_events: + session.delete(event) + return count + + def reset_stale_processing_requests( + self, + stale_minutes: int = 10, + max_retries: int = 3, + ) -> int: + """ + Reset requests stuck in 'processing' status. + + Requests that have been processing for more than stale_minutes + are considered stale. They are either reset to 'pending' (if under + max_retries) or set to 'failed'. 
def _dsn_to_postgres_url(dsn: str) -> str:
    """Convert a space-separated ``key=value`` DSN into a ``postgresql://`` URL.

    Each item is split on its first ``=`` only, so values that contain ``=``
    are kept intact. User, password and database name are percent-encoded so
    characters such as ``@``, ``:`` or ``/`` cannot corrupt the URL.

    Missing keys fall back to: empty user/password, host ``localhost``,
    port ``5432``, dbname ``docmaster``.

    Raises:
        ValueError: If an item is not of the form ``key=value``.
    """
    from urllib.parse import quote

    parts: dict[str, str] = {}
    for item in dsn.split():
        key, separator, value = item.partition("=")
        if not separator:
            # The offending item is deliberately left out of the message: it
            # may be a fragment of a password.
            raise ValueError(
                "Malformed DSN: expected space-separated key=value pairs"
            )
        parts[key] = value

    user = quote(parts.get("user", ""), safe="")
    password = quote(parts.get("password", ""), safe="")
    host = parts.get("host", "localhost")
    port = parts.get("port", "5432")
    dbname = quote(parts.get("dbname", "docmaster"), safe="")
    return f"postgresql://{user}:{password}@{host}:{port}/{dbname}"


def get_engine():
    """Get or create the database engine.

    The engine is created on first call from ``get_db_connection_string()``
    and cached in the module-level ``_engine``; later calls return the cached
    instance. A connection string that is not already a ``postgresql://`` URL
    but contains ``host=`` is treated as a ``key=value`` DSN and converted to
    URL form first.
    """
    global _engine
    if _engine is None:
        connection_string = get_db_connection_string()
        # Convert psycopg2 DSN format to SQLAlchemy URL format when needed;
        # strings already starting with postgresql:// are used as-is.
        if (
            not connection_string.startswith("postgresql://")
            and "host=" in connection_string
        ):
            connection_string = _dsn_to_postgres_url(connection_string)

        _engine = create_engine(
            connection_string,
            echo=False,  # Set to True for SQL debugging
            pool_pre_ping=True,  # Verify connections before use
            pool_size=5,
            max_overflow=10,
        )
    return _engine
b/src/data/migrations/001_async_tables.sql @@ -0,0 +1,83 @@ +-- Async Invoice Processing Tables +-- Migration: 001_async_tables.sql +-- Created: 2024-01-15 + +-- API Keys table for authentication and rate limiting +CREATE TABLE IF NOT EXISTS api_keys ( + api_key TEXT PRIMARY KEY, + name TEXT NOT NULL, + is_active BOOLEAN DEFAULT true, + + -- Rate limits + requests_per_minute INTEGER DEFAULT 10, + max_concurrent_jobs INTEGER DEFAULT 3, + max_file_size_mb INTEGER DEFAULT 50, + + -- Usage tracking + total_requests INTEGER DEFAULT 0, + total_processed INTEGER DEFAULT 0, + + -- Timestamps + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + last_used_at TIMESTAMPTZ +); + +-- Async processing requests table +CREATE TABLE IF NOT EXISTS async_requests ( + request_id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + api_key TEXT NOT NULL REFERENCES api_keys(api_key) ON DELETE CASCADE, + status TEXT NOT NULL DEFAULT 'pending', + filename TEXT NOT NULL, + file_size INTEGER NOT NULL, + content_type TEXT NOT NULL, + + -- Processing metadata + document_id TEXT, + error_message TEXT, + retry_count INTEGER DEFAULT 0, + + -- Timestamps + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + started_at TIMESTAMPTZ, + completed_at TIMESTAMPTZ, + expires_at TIMESTAMPTZ NOT NULL, + + -- Result storage (JSONB for flexibility) + result JSONB, + + -- Processing time + processing_time_ms REAL, + + -- Visualization path + visualization_path TEXT, + + CONSTRAINT valid_status CHECK (status IN ('pending', 'processing', 'completed', 'failed')) +); + +-- Indexes for async_requests +CREATE INDEX IF NOT EXISTS idx_async_requests_api_key ON async_requests(api_key); +CREATE INDEX IF NOT EXISTS idx_async_requests_status ON async_requests(status); +CREATE INDEX IF NOT EXISTS idx_async_requests_created_at ON async_requests(created_at); +CREATE INDEX IF NOT EXISTS idx_async_requests_expires_at ON async_requests(expires_at); +CREATE INDEX IF NOT EXISTS idx_async_requests_api_key_status ON async_requests(api_key, 
-- Rate limit tracking table.
-- Append-only log of per-key events; rows are removed together with their
-- API key (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS rate_limit_events (
    id SERIAL PRIMARY KEY,
    api_key TEXT NOT NULL REFERENCES api_keys(api_key) ON DELETE CASCADE,
    event_type TEXT NOT NULL, -- 'request', 'complete', 'fail'
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Index for rate limiting queries: newest events first within each key.
-- Note this is a full index over all rows, not a partial index on recent
-- events; old rows must be deleted separately to keep it small.
CREATE INDEX IF NOT EXISTS idx_rate_limit_events_api_key_time
ON rate_limit_events(api_key, created_at DESC);

-- Index on created_at alone, for deleting old rate limit events by age
CREATE INDEX IF NOT EXISTS idx_rate_limit_events_cleanup
ON rate_limit_events(created_at);

-- Insert default API key for development/testing.
-- NOTE(review): this migration seeds a well-known, hard-coded key with raised
-- limits into every database it runs against. Confirm it is never applied to
-- production, or that the key is deactivated/removed there.
-- ON CONFLICT makes re-running the migration a no-op for this row.
INSERT INTO api_keys (api_key, name, requests_per_minute, max_concurrent_jobs)
VALUES ('dev-api-key-12345', 'Development Key', 100, 10)
ON CONFLICT (api_key) DO NOTHING;
class ApiKey(SQLModel, table=True):
    """API key configuration and limits.

    Mapped to the ``api_keys`` table. The key string itself is the primary
    key and is the target of the foreign keys on ``AsyncRequest`` and
    ``RateLimitEvent``.
    """

    __tablename__ = "api_keys"

    # Identity
    api_key: str = Field(primary_key=True, max_length=255)
    name: str = Field(max_length=255)
    is_active: bool = Field(default=True)
    # Per-key limits
    requests_per_minute: int = Field(default=10)
    max_concurrent_jobs: int = Field(default=3)
    max_file_size_mb: int = Field(default=50)
    # Usage counters
    total_requests: int = Field(default=0)
    total_processed: int = Field(default=0)
    # NOTE(review): datetime.utcnow returns a *naive* datetime and is
    # deprecated since Python 3.12, while the SQL migration declares this
    # column TIMESTAMPTZ. Confirm naive/aware values are never mixed.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    last_used_at: datetime | None = Field(default=None)


class AsyncRequest(SQLModel, table=True):
    """Async request record.

    Mapped to the ``async_requests`` table; one row per submitted file.
    """

    __tablename__ = "async_requests"

    # A fresh UUID is generated client-side when none is supplied.
    request_id: UUID = Field(default_factory=uuid4, primary_key=True)
    api_key: str = Field(foreign_key="api_keys.api_key", max_length=255, index=True)
    # The SQL migration restricts this to pending/processing/completed/failed;
    # the model itself only limits the length.
    status: str = Field(default="pending", max_length=20, index=True)
    filename: str = Field(max_length=255)
    # Required: no default is declared.
    file_size: int
    content_type: str = Field(max_length=100)
    # Processing metadata
    document_id: str | None = Field(default=None, max_length=100)
    error_message: str | None = Field(default=None)
    retry_count: int = Field(default=0)
    # Timestamps (see the naive-datetime note on ApiKey.created_at)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    started_at: datetime | None = Field(default=None)
    completed_at: datetime | None = Field(default=None)
    # Required and indexed; callers must supply an expiry.
    expires_at: datetime = Field(index=True)
    # Stored through a generic JSON column; the SQL migration declares JSONB.
    result: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
    processing_time_ms: float | None = Field(default=None)
    visualization_path: str | None = Field(default=None, max_length=255)
# Read-only models for responses (without table=True)
class ApiKeyRead(SQLModel):
    """API key response model (read-only).

    Carries the identity and limit fields of ``ApiKey``; the usage counters
    and timestamps are not included.
    """

    # NOTE(review): the key string itself is part of this response model;
    # confirm it is only returned to callers entitled to see it.
    api_key: str
    name: str
    is_active: bool
    requests_per_minute: int
    max_concurrent_jobs: int
    max_file_size_mb: int


class AsyncRequestRead(SQLModel):
    """Async request response model (read-only).

    Declares the same fields as the ``AsyncRequest`` table model, without
    defaults, so every field must be provided when constructing it.
    """

    request_id: UUID
    api_key: str
    status: str
    filename: str
    file_size: int
    content_type: str
    document_id: str | None
    error_message: str | None
    retry_count: int
    created_at: datetime
    started_at: datetime | None
    completed_at: datetime | None
    expires_at: datetime
    result: dict[str, Any] | None
    processing_time_ms: float | None
    visualization_path: str | None
"""
Backward compatibility shim for admin_routes.py

DEPRECATED: Import from src.web.api.v1.admin.documents instead.
"""
from src.web.api.v1.admin import documents as _documents
from src.web.api.v1.admin.documents import *  # noqa: F401,F403

# The documents module exposes its router factory as ``create_documents_router``.
# ``__all__`` below promises the legacy name ``create_admin_router``, and a name
# listed in ``__all__`` that does not exist makes ``from ... import *`` raise
# AttributeError. Bind the legacy name explicitly, preferring a real
# ``create_admin_router`` if the documents module happens to define one.
create_admin_router = getattr(
    _documents, "create_admin_router", _documents.create_documents_router
)

__all__ = ["create_admin_router"]
+""" + +from src.web.api.v1.admin.annotations import create_annotation_router +from src.web.api.v1.admin.auth import create_auth_router +from src.web.api.v1.admin.documents import create_documents_router +from src.web.api.v1.admin.locks import create_locks_router +from src.web.api.v1.admin.training import create_training_router + +__all__ = [ + "create_annotation_router", + "create_auth_router", + "create_documents_router", + "create_locks_router", + "create_training_router", +] diff --git a/src/web/api/v1/admin/annotations.py b/src/web/api/v1/admin/annotations.py new file mode 100644 index 0000000..b67cb09 --- /dev/null +++ b/src/web/api/v1/admin/annotations.py @@ -0,0 +1,644 @@ +""" +Admin Annotation API Routes + +FastAPI endpoints for annotation management. +""" + +import logging +from pathlib import Path +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, HTTPException, Query +from fastapi.responses import FileResponse + +from src.data.admin_db import AdminDB +from src.data.admin_models import FIELD_CLASSES, FIELD_CLASS_IDS +from src.web.core.auth import AdminTokenDep, AdminDBDep +from src.web.services.autolabel import get_auto_label_service +from src.web.schemas.admin import ( + AnnotationCreate, + AnnotationItem, + AnnotationListResponse, + AnnotationOverrideRequest, + AnnotationOverrideResponse, + AnnotationResponse, + AnnotationSource, + AnnotationUpdate, + AnnotationVerifyRequest, + AnnotationVerifyResponse, + AutoLabelRequest, + AutoLabelResponse, + BoundingBox, +) +from src.web.schemas.common import ErrorResponse + +logger = logging.getLogger(__name__) + +# Image storage directory +ADMIN_IMAGES_DIR = Path("data/admin_images") + + +def _validate_uuid(value: str, name: str = "ID") -> None: + """Validate UUID format.""" + try: + UUID(value) + except ValueError: + raise HTTPException( + status_code=400, + detail=f"Invalid {name} format. 
Must be a valid UUID.", + ) + + +def create_annotation_router() -> APIRouter: + """Create annotation API router.""" + router = APIRouter(prefix="/admin/documents", tags=["Admin Annotations"]) + + # ========================================================================= + # Image Endpoints + # ========================================================================= + + @router.get( + "/{document_id}/images/{page_number}", + responses={ + 401: {"model": ErrorResponse, "description": "Invalid token"}, + 404: {"model": ErrorResponse, "description": "Not found"}, + }, + summary="Get page image", + description="Get the image for a specific page.", + ) + async def get_page_image( + document_id: str, + page_number: int, + admin_token: AdminTokenDep, + db: AdminDBDep, + ) -> FileResponse: + """Get page image.""" + _validate_uuid(document_id, "document_id") + + # Verify ownership + document = db.get_document_by_token(document_id, admin_token) + if document is None: + raise HTTPException( + status_code=404, + detail="Document not found or does not belong to this token", + ) + + # Validate page number + if page_number < 1 or page_number > document.page_count: + raise HTTPException( + status_code=404, + detail=f"Page {page_number} not found. 
async def list_annotations(
    document_id: str,
    admin_token: AdminTokenDep,
    db: AdminDBDep,
    page_number: Annotated[
        int | None,
        Query(ge=1, description="Filter by page number"),
    ] = None,
) -> AnnotationListResponse:
    """Return the annotations of a document owned by the calling token.

    Args:
        document_id: Document UUID from the path.
        admin_token: Token of the caller, used for the ownership lookup.
        db: Admin database facade.
        page_number: Optional page filter passed through to the database.

    Raises:
        HTTPException: 400 for a malformed UUID; 404 when the document is
            missing or not owned by ``admin_token``.
    """
    _validate_uuid(document_id, "document_id")

    owned_document = db.get_document_by_token(document_id, admin_token)
    if owned_document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found or does not belong to this token",
        )

    def _to_item(row) -> AnnotationItem:
        # Map one stored annotation row onto the API response item.
        pixel_box = BoundingBox(
            x=row.bbox_x,
            y=row.bbox_y,
            width=row.bbox_width,
            height=row.bbox_height,
        )
        normalized = {
            "x_center": row.x_center,
            "y_center": row.y_center,
            "width": row.width,
            "height": row.height,
        }
        return AnnotationItem(
            annotation_id=str(row.annotation_id),
            page_number=row.page_number,
            class_id=row.class_id,
            class_name=row.class_name,
            bbox=pixel_box,
            normalized_bbox=normalized,
            text_value=row.text_value,
            confidence=row.confidence,
            source=AnnotationSource(row.source),
            created_at=row.created_at,
        )

    stored_rows = db.get_annotations_for_document(document_id, page_number)
    items = [_to_item(row) for row in stored_rows]

    return AnnotationListResponse(
        document_id=document_id,
        page_count=owned_document.page_count,
        total_annotations=len(items),
        annotations=items,
    )
async def update_annotation(
    document_id: str,
    annotation_id: str,
    request: AnnotationUpdate,
    admin_token: AdminTokenDep,
    db: AdminDBDep,
) -> AnnotationResponse:
    """Update the class, text and/or bounding box of one annotation.

    Only fields present (non-``None``) on ``request`` are changed. When a new
    bounding box is supplied, its normalized center/size values are recomputed
    from the stored page image's dimensions.

    Args:
        document_id: Document UUID from the path.
        annotation_id: Annotation UUID from the path.
        request: Partial update payload.
        admin_token: Token of the caller, used for the ownership lookup.
        db: Admin database facade.

    Raises:
        HTTPException: 400 for a malformed UUID or when the page image needed
            to normalize a new bounding box is missing; 404 when the document
            or annotation is not found, or the annotation belongs to another
            document; 500 when the database reports the update failed.
    """
    _validate_uuid(document_id, "document_id")
    _validate_uuid(annotation_id, "annotation_id")

    # Verify ownership
    document = db.get_document_by_token(document_id, admin_token)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found or does not belong to this token",
        )

    # Get existing annotation
    annotation = db.get_annotation(annotation_id)
    if annotation is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )

    # Verify annotation belongs to document
    if str(annotation.document_id) != document_id:
        raise HTTPException(
            status_code=404,
            detail="Annotation does not belong to this document",
        )

    # Prepare update data
    update_kwargs = {}

    if request.class_id is not None:
        update_kwargs["class_id"] = request.class_id
        update_kwargs["class_name"] = FIELD_CLASSES.get(
            request.class_id, f"class_{request.class_id}"
        )

    if request.text_value is not None:
        update_kwargs["text_value"] = request.text_value

    if request.bbox is not None:
        image_path = ADMIN_IMAGES_DIR / document_id / f"page_{annotation.page_number}.png"
        # Without this check a missing page image surfaces as an unhandled
        # FileNotFoundError (HTTP 500); report it as a 400, using the same
        # wording the create endpoint uses for this situation.
        if not image_path.exists():
            raise HTTPException(
                status_code=400,
                detail=f"Image for page {annotation.page_number} not available",
            )

        from PIL import Image
        with Image.open(image_path) as img:
            image_width, image_height = img.size

        box = request.bbox
        # Normalized (center/size relative to image) coordinates
        update_kwargs["x_center"] = (box.x + box.width / 2) / image_width
        update_kwargs["y_center"] = (box.y + box.height / 2) / image_height
        update_kwargs["width"] = box.width / image_width
        update_kwargs["height"] = box.height / image_height
        # Pixel coordinates exactly as submitted
        update_kwargs["bbox_x"] = box.x
        update_kwargs["bbox_y"] = box.y
        update_kwargs["bbox_width"] = box.width
        update_kwargs["bbox_height"] = box.height

    # An empty payload is accepted and reported as success without a write.
    if update_kwargs:
        success = db.update_annotation(annotation_id, **update_kwargs)
        if not success:
            raise HTTPException(
                status_code=500,
                detail="Failed to update annotation",
            )

    return AnnotationResponse(
        annotation_id=annotation_id,
        message="Annotation updated successfully",
    )
async def trigger_auto_label(
    document_id: str,
    request: AutoLabelRequest,
    admin_token: AdminTokenDep,
    db: AdminDBDep,
) -> AutoLabelResponse:
    """Run auto-labeling for a document using caller-supplied field values.

    Args:
        document_id: Document UUID from the path.
        request: Field values plus the ``replace_existing`` flag.
        admin_token: Token of the caller, used for the ownership lookup.
        db: Admin database facade, also handed to the labeling service.

    Raises:
        HTTPException: 400 for a malformed UUID or empty ``field_values``;
            404 when the document is missing or not owned by the token;
            500 when the service reports status ``"failed"``.
    """
    _validate_uuid(document_id, "document_id")

    target = db.get_document_by_token(document_id, admin_token)
    if target is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found or does not belong to this token",
        )

    if not request.field_values:
        raise HTTPException(
            status_code=400,
            detail="At least one field value is required",
        )

    # The service call is synchronous; its result is a plain dict.
    outcome = get_auto_label_service().auto_label_document(
        document_id=document_id,
        file_path=target.file_path,
        field_values=request.field_values,
        db=db,
        replace_existing=request.replace_existing,
    )

    if outcome["status"] == "failed":
        raise HTTPException(
            status_code=500,
            detail=f"Auto-labeling failed: {outcome.get('error', 'Unknown error')}",
        )

    return AutoLabelResponse(
        document_id=document_id,
        status=outcome["status"],
        annotations_created=outcome["annotations_created"],
        message=f"Auto-labeling completed. Created {outcome['annotations_created']} annotations.",
    )
async def verify_annotation(
    document_id: str,
    annotation_id: str,
    admin_token: AdminTokenDep,
    db: AdminDBDep,
    request: AnnotationVerifyRequest = AnnotationVerifyRequest(),
) -> AnnotationVerifyResponse:
    """Mark an annotation of the caller's document as verified.

    Args:
        document_id: Document UUID from the path.
        annotation_id: Annotation UUID from the path.
        admin_token: Token of the caller; used for the ownership lookup and
            passed to the database as the verifier.
        db: Admin database facade.
        request: Optional request body; not read by this handler.

    Raises:
        HTTPException: 400 for a malformed UUID; 404 when the document is
            missing or not owned by the token, when the annotation does not
            exist, or when it belongs to a different document.
    """
    _validate_uuid(document_id, "document_id")
    _validate_uuid(annotation_id, "annotation_id")

    # Verify ownership of document
    document = db.get_document_by_token(document_id, admin_token)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found or does not belong to this token",
        )

    # Owning *some* document must not grant access to annotations of other
    # documents: require the annotation to belong to the document in the
    # path, exactly as the update and delete endpoints do.
    existing = db.get_annotation(annotation_id)
    if existing is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )
    if str(existing.document_id) != document_id:
        raise HTTPException(
            status_code=404,
            detail="Annotation does not belong to this document",
        )

    # Verify the annotation
    annotation = db.verify_annotation(annotation_id, admin_token)
    if annotation is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )

    return AnnotationVerifyResponse(
        annotation_id=annotation_id,
        is_verified=annotation.is_verified,
        verified_at=annotation.verified_at,
        verified_by=annotation.verified_by,
        message="Annotation verified successfully",
    )
async def override_annotation(
    document_id: str,
    annotation_id: str,
    request: AnnotationOverrideRequest,
    admin_token: AdminTokenDep,
    db: AdminDBDep,
) -> AnnotationOverrideResponse:
    """Override an annotation of the caller's document with manual corrections.

    Only fields present on ``request`` are changed. ``class_name`` from the
    request wins over the name derived from ``class_id``.

    Args:
        document_id: Document UUID from the path.
        annotation_id: Annotation UUID from the path.
        request: Corrections (text, class, partial bbox dict) and a reason.
        admin_token: Token of the caller; used for the ownership lookup and
            passed to the database with the override.
        db: Admin database facade.

    Raises:
        HTTPException: 400 for a malformed UUID or an empty set of updates;
            404 when the document is missing or not owned by the token, when
            the annotation does not exist, or when it belongs to a different
            document.
    """
    _validate_uuid(document_id, "document_id")
    _validate_uuid(annotation_id, "annotation_id")

    # Verify ownership of document
    document = db.get_document_by_token(document_id, admin_token)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found or does not belong to this token",
        )

    # Build updates dict from request
    updates = {}
    if request.text_value is not None:
        updates["text_value"] = request.text_value
    if request.class_id is not None:
        updates["class_id"] = request.class_id
        # Update class_name if class_id changed
        if request.class_id in FIELD_CLASSES:
            updates["class_name"] = FIELD_CLASSES[request.class_id]
    if request.class_name is not None:
        updates["class_name"] = request.class_name
    if request.bbox:
        # Copy only the pixel-box keys that were actually sent.
        # NOTE(review): the normalized x_center/y_center/width/height values
        # are not recomputed here; confirm db.override_annotation keeps them
        # in sync with the pixel box.
        for key in ("x", "y", "width", "height"):
            if key in request.bbox:
                updates[f"bbox_{key}"] = request.bbox[key]

    if not updates:
        raise HTTPException(
            status_code=400,
            detail="No updates provided. Specify at least one field to update.",
        )

    # Owning *some* document must not grant write access to annotations of
    # other documents: require the annotation to belong to the document in
    # the path, exactly as the update and delete endpoints do.
    existing = db.get_annotation(annotation_id)
    if existing is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )
    if str(existing.document_id) != document_id:
        raise HTTPException(
            status_code=404,
            detail="Annotation does not belong to this document",
        )

    # Override the annotation
    annotation = db.override_annotation(
        annotation_id=annotation_id,
        admin_token=admin_token,
        change_reason=request.reason,
        **updates,
    )

    if annotation is None:
        raise HTTPException(
            status_code=404,
            detail="Annotation not found",
        )

    # The first history record is reported as the one for this override
    # (presumably newest-first ordering; verify against AdminDB).
    history_records = db.get_annotation_history(UUID(annotation_id))
    latest_history = history_records[0] if history_records else None

    return AnnotationOverrideResponse(
        annotation_id=annotation_id,
        source=annotation.source,
        override_source=annotation.override_source,
        original_annotation_id=str(annotation.original_annotation_id) if annotation.original_annotation_id else None,
        message="Annotation overridden successfully",
        history_id=str(latest_history.history_id) if latest_history else "",
    )
def create_auth_router() -> APIRouter:
    """Build the ``/admin/auth`` router with token create and revoke routes.

    NOTE(review): the token-creation route declares no ``AdminTokenDep``, so
    as written it does not require an existing token; confirm this is the
    intended bootstrap behaviour and that it is protected elsewhere.
    """
    auth_router = APIRouter(prefix="/admin/auth", tags=["Admin Auth"])

    @auth_router.post(
        "/token",
        response_model=AdminTokenResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid request"},
        },
        summary="Create admin token",
        description="Create a new admin authentication token.",
    )
    async def create_token(
        request: AdminTokenCreate,
        db: AdminDBDep,
    ) -> AdminTokenResponse:
        """Generate, store and return a new random admin token."""
        new_token = secrets.token_urlsafe(32)

        # A falsy lifetime (None or 0) means the token never expires.
        lifetime_days = request.expires_in_days
        expiry = (
            datetime.utcnow() + timedelta(days=lifetime_days)
            if lifetime_days
            else None
        )

        db.create_admin_token(
            token=new_token,
            name=request.name,
            expires_at=expiry,
        )

        return AdminTokenResponse(
            token=new_token,
            name=request.name,
            expires_at=expiry,
            message="Admin token created successfully",
        )

    @auth_router.delete(
        "/token",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Revoke admin token",
        description="Revoke the current admin token.",
    )
    async def revoke_token(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> dict:
        """Deactivate the token the caller authenticated with."""
        db.deactivate_admin_token(admin_token)
        return {
            "status": "revoked",
            "message": "Admin token has been revoked",
        }

    return auth_router
def _convert_pdf_to_images(
    document_id: str, content: bytes, page_count: int, images_dir: Path, dpi: int
) -> None:
    """Render the first *page_count* pages of a PDF to PNG files.

    Files are written to ``images_dir / document_id / page_<n>.png`` with
    1-based page numbers; the directory is created if needed.

    Args:
        document_id: Name of the per-document output directory.
        content: Raw PDF bytes.
        page_count: Number of pages to render, starting from the first.
        images_dir: Root directory for rendered page images.
        dpi: Render resolution; PDF user space is 72 units per inch, so the
            zoom factor is ``dpi / 72``.

    Raises:
        Exception: Whatever PyMuPDF raises for an unreadable PDF, a page
            index beyond the document, or a failed save. The PDF handle is
            closed in every case.
    """
    import fitz

    doc_images_dir = images_dir / document_id
    doc_images_dir.mkdir(parents=True, exist_ok=True)

    pdf_doc = fitz.open(stream=content, filetype="pdf")
    try:
        # Same scale for every page: build the matrix once, outside the loop.
        zoom = dpi / 72
        render_matrix = fitz.Matrix(zoom, zoom)

        for page_num in range(page_count):
            pix = pdf_doc[page_num].get_pixmap(matrix=render_matrix)
            image_path = doc_images_dir / f"page_{page_num + 1}.png"
            pix.save(str(image_path))
    finally:
        # Release the document even when rendering or saving fails.
        pdf_doc.close()
async def upload_document(
    admin_token: AdminTokenDep,
    db: AdminDBDep,
    file: UploadFile = File(..., description="PDF or image file"),
    auto_label: Annotated[
        bool,
        Query(description="Trigger auto-labeling after upload"),
    ] = True,
) -> DocumentUploadResponse:
    """Upload a document for labeling.

    Steps, in order: validate the filename and extension, read the whole
    upload into memory, count PDF pages, create the database record, write
    the file to the admin upload directory, store the final path on the
    record, render PDF pages to images, and optionally flag the document for
    auto-labeling.

    Raises:
        HTTPException: 400 when the filename is missing, the extension is not
            allowed, or the upload cannot be read; 500 when the file cannot
            be written to disk.
    """
    # Validate filename
    if not file.filename:
        raise HTTPException(status_code=400, detail="Filename is required")

    # Validate extension
    file_ext = Path(file.filename).suffix.lower()
    if file_ext not in allowed_extensions:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file_ext}. "
            f"Allowed: {', '.join(allowed_extensions)}",
        )

    # Read file content (entire upload is held in memory)
    try:
        content = await file.read()
    except Exception as e:
        logger.error(f"Failed to read uploaded file: {e}")
        raise HTTPException(status_code=400, detail="Failed to read file")

    # Get page count (for PDF); non-PDF uploads are treated as one page.
    # Best effort: if the PDF cannot be opened, the count stays at 1 and the
    # upload continues.
    page_count = 1
    if file_ext == ".pdf":
        try:
            import fitz
            pdf_doc = fitz.open(stream=content, filetype="pdf")
            page_count = len(pdf_doc)
            pdf_doc.close()
        except Exception as e:
            logger.warning(f"Failed to get PDF page count: {e}")

    # Create document record (token only used for auth, not stored)
    document_id = db.create_document(
        filename=file.filename,
        file_size=len(content),
        content_type=file.content_type or "application/octet-stream",
        file_path="",  # Will update after saving
        page_count=page_count,
    )

    # Save file to admin uploads, named after the new document id
    # NOTE(review): if this write fails, the record created above is left
    # behind with an empty file_path; consider cleaning it up.
    file_path = storage_config.admin_upload_dir / f"{document_id}{file_ext}"
    try:
        file_path.write_bytes(content)
    except Exception as e:
        logger.error(f"Failed to save file: {e}")
        raise HTTPException(status_code=500, detail="Failed to save file")

    # Update file path in database (the path is only known after the id exists)
    from src.data.database import get_session_context
    from src.data.admin_models import AdminDocument
    with get_session_context() as session:
        doc = session.get(AdminDocument, UUID(document_id))
        if doc:
            doc.file_path = str(file_path)
            session.add(doc)

    # Convert PDF to images for annotation
    # Failures are logged and swallowed, so the upload still succeeds.
    # NOTE(review): nothing here writes a page image for non-PDF uploads;
    # confirm another component provides page_1.png for image documents.
    if file_ext == ".pdf":
        try:
            _convert_pdf_to_images(
                document_id, content, page_count,
                storage_config.admin_images_dir, storage_config.dpi
            )
        except Exception as e:
            logger.error(f"Failed to convert PDF to images: {e}")

    # Trigger auto-labeling if requested
    auto_label_started = False
    if auto_label:
        # Auto-labeling will be triggered by a background task
        # (this handler only records the status change)
        db.update_document_status(
            document_id=document_id,
            status="auto_labeling",
            auto_label_status="running",
        )
        auto_label_started = True

    return DocumentUploadResponse(
        document_id=document_id,
        filename=file.filename,
        file_size=len(content),
        page_count=page_count,
        status=DocumentStatus.AUTO_LABELING if auto_label_started else DocumentStatus.PENDING,
        auto_label_started=auto_label_started,
        message="Document uploaded successfully",
    )
Query(description="Filter by batch ID"), + ] = None, + limit: Annotated[ + int, + Query(ge=1, le=100, description="Page size"), + ] = 20, + offset: Annotated[ + int, + Query(ge=0, description="Offset"), + ] = 0, + ) -> DocumentListResponse: + """List documents.""" + # Validate status + if status and status not in ("pending", "auto_labeling", "labeled", "exported"): + raise HTTPException( + status_code=400, + detail=f"Invalid status: {status}", + ) + + # Validate upload_source + if upload_source and upload_source not in ("ui", "api"): + raise HTTPException( + status_code=400, + detail=f"Invalid upload_source: {upload_source}", + ) + + # Validate auto_label_status + if auto_label_status and auto_label_status not in ("pending", "running", "completed", "failed"): + raise HTTPException( + status_code=400, + detail=f"Invalid auto_label_status: {auto_label_status}", + ) + + documents, total = db.get_documents_by_token( + admin_token=admin_token, + status=status, + upload_source=upload_source, + has_annotations=has_annotations, + auto_label_status=auto_label_status, + batch_id=batch_id, + limit=limit, + offset=offset, + ) + + # Get annotation counts and build items + items = [] + for doc in documents: + annotations = db.get_annotations_for_document(str(doc.document_id)) + + # Determine if document can be annotated (not locked) + can_annotate = True + if hasattr(doc, 'annotation_lock_until') and doc.annotation_lock_until: + from datetime import datetime, timezone + can_annotate = doc.annotation_lock_until < datetime.now(timezone.utc) + + items.append( + DocumentItem( + document_id=str(doc.document_id), + filename=doc.filename, + file_size=doc.file_size, + page_count=doc.page_count, + status=DocumentStatus(doc.status), + auto_label_status=AutoLabelStatus(doc.auto_label_status) if doc.auto_label_status else None, + annotation_count=len(annotations), + upload_source=doc.upload_source if hasattr(doc, 'upload_source') else "ui", + batch_id=str(doc.batch_id) if hasattr(doc, 
'batch_id') and doc.batch_id else None, + can_annotate=can_annotate, + created_at=doc.created_at, + updated_at=doc.updated_at, + ) + ) + + return DocumentListResponse( + total=total, + limit=limit, + offset=offset, + documents=items, + ) + + @router.get( + "/stats", + response_model=DocumentStatsResponse, + responses={ + 401: {"model": ErrorResponse, "description": "Invalid token"}, + }, + summary="Get document statistics", + description="Get document count by status.", + ) + async def get_document_stats( + admin_token: AdminTokenDep, + db: AdminDBDep, + ) -> DocumentStatsResponse: + """Get document statistics.""" + counts = db.count_documents_by_status(admin_token) + + return DocumentStatsResponse( + total=sum(counts.values()), + pending=counts.get("pending", 0), + auto_labeling=counts.get("auto_labeling", 0), + labeled=counts.get("labeled", 0), + exported=counts.get("exported", 0), + ) + + @router.get( + "/{document_id}", + response_model=DocumentDetailResponse, + responses={ + 401: {"model": ErrorResponse, "description": "Invalid token"}, + 404: {"model": ErrorResponse, "description": "Document not found"}, + }, + summary="Get document detail", + description="Get document details with annotations.", + ) + async def get_document( + document_id: str, + admin_token: AdminTokenDep, + db: AdminDBDep, + ) -> DocumentDetailResponse: + """Get document details.""" + _validate_uuid(document_id, "document_id") + + document = db.get_document_by_token(document_id, admin_token) + if document is None: + raise HTTPException( + status_code=404, + detail="Document not found or does not belong to this token", + ) + + # Get annotations + raw_annotations = db.get_annotations_for_document(document_id) + annotations = [ + AnnotationItem( + annotation_id=str(ann.annotation_id), + page_number=ann.page_number, + class_id=ann.class_id, + class_name=ann.class_name, + bbox=BoundingBox( + x=ann.bbox_x, + y=ann.bbox_y, + width=ann.bbox_width, + height=ann.bbox_height, + ), + normalized_bbox={ 
+ "x_center": ann.x_center, + "y_center": ann.y_center, + "width": ann.width, + "height": ann.height, + }, + text_value=ann.text_value, + confidence=ann.confidence, + source=AnnotationSource(ann.source), + created_at=ann.created_at, + ) + for ann in raw_annotations + ] + + # Generate image URLs + image_urls = [] + for page in range(1, document.page_count + 1): + image_urls.append(f"/api/v1/admin/documents/{document_id}/images/{page}") + + # Determine if document can be annotated (not locked) + can_annotate = True + annotation_lock_until = None + if hasattr(document, 'annotation_lock_until') and document.annotation_lock_until: + from datetime import datetime, timezone + annotation_lock_until = document.annotation_lock_until + can_annotate = document.annotation_lock_until < datetime.now(timezone.utc) + + # Get CSV field values if available + csv_field_values = None + if hasattr(document, 'csv_field_values') and document.csv_field_values: + csv_field_values = document.csv_field_values + + # Get training history (Phase 5) + training_history = [] + training_links = db.get_document_training_tasks(document.document_id) + for link in training_links: + # Get task details + task = db.get_training_task(str(link.task_id)) + if task: + # Build metrics + metrics = None + if task.metrics_mAP or task.metrics_precision or task.metrics_recall: + metrics = ModelMetrics( + mAP=task.metrics_mAP, + precision=task.metrics_precision, + recall=task.metrics_recall, + ) + + training_history.append( + TrainingHistoryItem( + task_id=str(link.task_id), + name=task.name, + trained_at=link.created_at, + model_metrics=metrics, + ) + ) + + return DocumentDetailResponse( + document_id=str(document.document_id), + filename=document.filename, + file_size=document.file_size, + content_type=document.content_type, + page_count=document.page_count, + status=DocumentStatus(document.status), + auto_label_status=AutoLabelStatus(document.auto_label_status) if document.auto_label_status else None, + 
auto_label_error=document.auto_label_error, + upload_source=document.upload_source if hasattr(document, 'upload_source') else "ui", + batch_id=str(document.batch_id) if hasattr(document, 'batch_id') and document.batch_id else None, + csv_field_values=csv_field_values, + can_annotate=can_annotate, + annotation_lock_until=annotation_lock_until, + annotations=annotations, + image_urls=image_urls, + training_history=training_history, + created_at=document.created_at, + updated_at=document.updated_at, + ) + + @router.delete( + "/{document_id}", + responses={ + 401: {"model": ErrorResponse, "description": "Invalid token"}, + 404: {"model": ErrorResponse, "description": "Document not found"}, + }, + summary="Delete document", + description="Delete a document and its annotations.", + ) + async def delete_document( + document_id: str, + admin_token: AdminTokenDep, + db: AdminDBDep, + ) -> dict: + """Delete a document.""" + _validate_uuid(document_id, "document_id") + + # Verify ownership + document = db.get_document_by_token(document_id, admin_token) + if document is None: + raise HTTPException( + status_code=404, + detail="Document not found or does not belong to this token", + ) + + # Delete file + file_path = Path(document.file_path) + if file_path.exists(): + file_path.unlink() + + # Delete images + images_dir = ADMIN_IMAGES_DIR / document_id + if images_dir.exists(): + import shutil + shutil.rmtree(images_dir) + + # Delete from database + db.delete_document(document_id) + + return { + "status": "deleted", + "document_id": document_id, + "message": "Document deleted successfully", + } + + @router.patch( + "/{document_id}/status", + responses={ + 401: {"model": ErrorResponse, "description": "Invalid token"}, + 404: {"model": ErrorResponse, "description": "Document not found"}, + }, + summary="Update document status", + description="Update document status (e.g., mark as labeled). 
When marking as 'labeled', annotations are saved to PostgreSQL.", + ) + async def update_document_status( + document_id: str, + admin_token: AdminTokenDep, + db: AdminDBDep, + status: Annotated[ + str, + Query(description="New status"), + ], + ) -> dict: + """Update document status. + + When status is set to 'labeled', the annotations are automatically + saved to PostgreSQL documents/field_results tables for consistency + with CLI auto-label workflow. + """ + _validate_uuid(document_id, "document_id") + + # Validate status + if status not in ("pending", "labeled", "exported"): + raise HTTPException( + status_code=400, + detail=f"Invalid status: {status}", + ) + + # Verify ownership + document = db.get_document_by_token(document_id, admin_token) + if document is None: + raise HTTPException( + status_code=404, + detail="Document not found or does not belong to this token", + ) + + # If marking as labeled, save annotations to PostgreSQL DocumentDB + db_save_result = None + if status == "labeled": + from src.web.services.db_autolabel import save_manual_annotations_to_document_db + + # Get all annotations for this document + annotations = db.get_annotations_for_document(document_id) + + if annotations: + db_save_result = save_manual_annotations_to_document_db( + document=document, + annotations=annotations, + db=db, + ) + + db.update_document_status(document_id, status) + + response = { + "status": "updated", + "document_id": document_id, + "new_status": status, + "message": "Document status updated", + } + + # Include PostgreSQL save result if applicable + if db_save_result: + response["document_db_saved"] = db_save_result.get("success", False) + response["fields_saved"] = db_save_result.get("fields_saved", 0) + + return response + + return router diff --git a/src/web/api/v1/admin/locks.py b/src/web/api/v1/admin/locks.py new file mode 100644 index 0000000..1b5f46e --- /dev/null +++ b/src/web/api/v1/admin/locks.py @@ -0,0 +1,184 @@ +""" +Admin Document Lock Routes + 
"""
Admin Document Lock Routes

FastAPI endpoints for annotation lock management.
"""

import logging
from typing import Annotated
from uuid import UUID

from fastapi import APIRouter, HTTPException, Query

from src.web.core.auth import AdminTokenDep, AdminDBDep
from src.web.schemas.admin import (
    AnnotationLockRequest,
    AnnotationLockResponse,
)
from src.web.schemas.common import ErrorResponse

logger = logging.getLogger(__name__)


def _validate_uuid(value: str, name: str = "ID") -> None:
    """Raise HTTP 400 unless *value* can be parsed by ``uuid.UUID``.

    Args:
        value: Candidate UUID string.
        name: Label for the value, used in the error detail.

    Raises:
        HTTPException: 400 when parsing fails.
    """
    try:
        UUID(value)
    except (ValueError, AttributeError, TypeError):
        # UUID() raises ValueError for malformed text and AttributeError or
        # TypeError for non-string input; all mean "not a UUID". "from None"
        # keeps the parsing traceback out of the HTTP error's context.
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {name} format. Must be a valid UUID.",
        ) from None


def _require_owned_document(db, document_id: str, admin_token):
    """Return the document for this token, or raise HTTP 404.

    Raises:
        HTTPException: 404 when ``db.get_document_by_token`` returns None.
    """
    document = db.get_document_by_token(document_id, admin_token)
    if document is None:
        raise HTTPException(
            status_code=404,
            detail="Document not found or does not belong to this token",
        )
    return document


def create_locks_router() -> APIRouter:
    """Build the router with the acquire/release/extend annotation-lock endpoints.

    Each endpoint validates the document ID, checks the document is visible to
    the caller's token, then delegates to the matching ``db`` lock method.
    """
    router = APIRouter(prefix="/admin/documents", tags=["Admin Locks"])

    @router.post(
        "/{document_id}/lock",
        response_model=AnnotationLockResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
            409: {"model": ErrorResponse, "description": "Document already locked"},
        },
        summary="Acquire annotation lock",
        description="Acquire a lock on a document to prevent concurrent annotation edits.",
    )
    async def acquire_lock(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        request: AnnotationLockRequest = AnnotationLockRequest(),
    ) -> AnnotationLockResponse:
        """Acquire the annotation lock for ``request.duration_seconds``.

        Raises:
            HTTPException: 400 for a malformed ID, 404 when the document is not
                found for this token, 409 when ``db.acquire_annotation_lock``
                returns None.
        """
        _validate_uuid(document_id, "document_id")
        _require_owned_document(db, document_id, admin_token)

        locked_doc = db.acquire_annotation_lock(
            document_id=document_id,
            admin_token=admin_token,
            duration_seconds=request.duration_seconds,
        )
        if locked_doc is None:
            raise HTTPException(
                status_code=409,
                detail="Document is already locked. Please try again later.",
            )

        return AnnotationLockResponse(
            document_id=document_id,
            locked=True,
            lock_expires_at=locked_doc.annotation_lock_until,
            message=f"Lock acquired for {request.duration_seconds} seconds",
        )

    @router.delete(
        "/{document_id}/lock",
        response_model=AnnotationLockResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Release annotation lock",
        description="Release the annotation lock on a document.",
    )
    async def release_lock(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        force: Annotated[
            bool,
            Query(description="Force release (admin override)"),
        ] = False,
    ) -> AnnotationLockResponse:
        """Release the annotation lock; ``force`` is passed through to the DB layer.

        Raises:
            HTTPException: 400 for a malformed ID; 404 when the document is not
                found for this token or ``db.release_annotation_lock`` returns
                None.
        """
        _validate_uuid(document_id, "document_id")
        _require_owned_document(db, document_id, admin_token)

        released_doc = db.release_annotation_lock(
            document_id=document_id,
            admin_token=admin_token,
            force=force,
        )
        if released_doc is None:
            raise HTTPException(
                status_code=404,
                detail="Failed to release lock",
            )

        return AnnotationLockResponse(
            document_id=document_id,
            locked=False,
            lock_expires_at=None,
            message="Lock released successfully",
        )

    @router.patch(
        "/{document_id}/lock",
        response_model=AnnotationLockResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
            409: {"model": ErrorResponse, "description": "Lock expired or doesn't exist"},
        },
        summary="Extend annotation lock",
        description="Extend an existing annotation lock.",
    )
    async def extend_lock(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        request: AnnotationLockRequest = AnnotationLockRequest(),
    ) -> AnnotationLockResponse:
        """Extend the current lock by ``request.duration_seconds``.

        Raises:
            HTTPException: 400 for a malformed ID, 404 when the document is not
                found for this token, 409 when ``db.extend_annotation_lock``
                returns None.
        """
        _validate_uuid(document_id, "document_id")
        _require_owned_document(db, document_id, admin_token)

        extended_doc = db.extend_annotation_lock(
            document_id=document_id,
            admin_token=admin_token,
            additional_seconds=request.duration_seconds,
        )
        if extended_doc is None:
            raise HTTPException(
                status_code=409,
                detail="Lock doesn't exist or has expired. Please acquire a new lock.",
            )

        return AnnotationLockResponse(
            document_id=document_id,
            locked=True,
            lock_expires_at=extended_doc.annotation_lock_until,
            message=f"Lock extended by {request.duration_seconds} seconds",
        )

    return router


# Remainder of the original physical line (patch header and docstring opening
# of the next file in this dump), kept verbatim:
# diff --git a/src/web/api/v1/admin/training.py b/src/web/api/v1/admin/training.py new file mode 100644 index 0000000..515f023 --- /dev/null +++ b/src/web/api/v1/admin/training.py @@ -0,0 +1,622 @@ +""" +Admin Training API Routes + +FastAPI endpoints for training task management and scheduling.
"""
Admin Training API Routes

FastAPI endpoints for training task management and scheduling.
"""

import logging
from datetime import datetime
from typing import Annotated, Any
from uuid import UUID

from fastapi import APIRouter, HTTPException, Query

from src.data.admin_db import AdminDB
from src.web.core.auth import AdminTokenDep, AdminDBDep
from src.web.schemas.admin import (
    ExportRequest,
    ExportResponse,
    ModelMetrics,
    TrainingConfig,
    TrainingDocumentItem,
    TrainingDocumentsResponse,
    TrainingHistoryItem,
    TrainingLogItem,
    TrainingLogsResponse,
    TrainingModelItem,
    TrainingModelsResponse,
    TrainingStatus,
    TrainingTaskCreate,
    TrainingTaskDetailResponse,
    TrainingTaskItem,
    TrainingTaskListResponse,
    TrainingTaskResponse,
    TrainingType,
)
from src.web.schemas.common import ErrorResponse

logger = logging.getLogger(__name__)


def _validate_uuid(value: str, name: str = "ID") -> None:
    """Raise HTTP 400 unless *value* can be parsed by ``uuid.UUID``.

    Args:
        value: Candidate UUID string.
        name: Label for the value, used in the error detail.

    Raises:
        HTTPException: 400 when parsing fails.
    """
    try:
        UUID(value)
    except (ValueError, AttributeError, TypeError):
        # UUID() raises ValueError for malformed text and AttributeError or
        # TypeError for non-string input; "from None" keeps the parsing
        # traceback out of the HTTP error's context.
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {name} format. Must be a valid UUID.",
        ) from None


def _require_owned_task(db, task_id: str, admin_token):
    """Return the training task for this token, or raise HTTP 404.

    Raises:
        HTTPException: 404 when ``db.get_training_task_by_token`` returns None.
    """
    task = db.get_training_task_by_token(task_id, admin_token)
    if task is None:
        raise HTTPException(
            status_code=404,
            detail="Training task not found or does not belong to this token",
        )
    return task


def create_training_router() -> APIRouter:
    """Build the router for training tasks, training data, models and export."""
    router = APIRouter(prefix="/admin/training", tags=["Admin Training"])

    # =========================================================================
    # Training Task Endpoints
    # =========================================================================

    @router.post(
        "/tasks",
        response_model=TrainingTaskResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid request"},
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Create training task",
        description="Create a new training task.",
    )
    async def create_training_task(
        request: TrainingTaskCreate,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> TrainingTaskResponse:
        """Create a training task.

        The task is recurring when a cron expression is supplied. The returned
        status is SCHEDULED when ``scheduled_at`` is set, otherwise PENDING.
        """
        # A missing config is stored as an empty dict.
        config_dict = request.config.model_dump() if request.config else {}

        task_id = db.create_training_task(
            admin_token=admin_token,
            name=request.name,
            task_type=request.task_type.value,
            description=request.description,
            config=config_dict,
            scheduled_at=request.scheduled_at,
            cron_expression=request.cron_expression,
            is_recurring=bool(request.cron_expression),
        )

        return TrainingTaskResponse(
            task_id=task_id,
            status=TrainingStatus.SCHEDULED if request.scheduled_at else TrainingStatus.PENDING,
            message="Training task created successfully",
        )

    @router.get(
        "/tasks",
        response_model=TrainingTaskListResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="List training tasks",
        description="List all training tasks.",
    )
    async def list_training_tasks(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        status: Annotated[
            str | None,
            Query(description="Filter by status"),
        ] = None,
        limit: Annotated[
            int,
            Query(ge=1, le=100, description="Page size"),
        ] = 20,
        offset: Annotated[
            int,
            Query(ge=0, description="Offset"),
        ] = 0,
    ) -> TrainingTaskListResponse:
        """Return one page of training tasks, optionally filtered by status.

        Raises:
            HTTPException: 400 when ``status`` is not a known task status.
        """
        valid_statuses = ("pending", "scheduled", "running", "completed", "failed", "cancelled")
        if status and status not in valid_statuses:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid status: {status}. Must be one of: {', '.join(valid_statuses)}",
            )

        tasks, total = db.get_training_tasks_by_token(
            admin_token=admin_token,
            status=status,
            limit=limit,
            offset=offset,
        )

        items = [
            TrainingTaskItem(
                task_id=str(task.task_id),
                name=task.name,
                task_type=TrainingType(task.task_type),
                status=TrainingStatus(task.status),
                scheduled_at=task.scheduled_at,
                is_recurring=task.is_recurring,
                started_at=task.started_at,
                completed_at=task.completed_at,
                created_at=task.created_at,
            )
            for task in tasks
        ]

        return TrainingTaskListResponse(
            total=total,
            limit=limit,
            offset=offset,
            tasks=items,
        )

    @router.get(
        "/tasks/{task_id}",
        response_model=TrainingTaskDetailResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Task not found"},
        },
        summary="Get training task detail",
        description="Get training task details.",
    )
    async def get_training_task(
        task_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> TrainingTaskDetailResponse:
        """Return the full record of one training task.

        Raises:
            HTTPException: 400 for a malformed ID, 404 when the task is not
                found for this token.
        """
        _validate_uuid(task_id, "task_id")
        task = _require_owned_task(db, task_id, admin_token)

        return TrainingTaskDetailResponse(
            task_id=str(task.task_id),
            name=task.name,
            description=task.description,
            task_type=TrainingType(task.task_type),
            status=TrainingStatus(task.status),
            config=task.config,
            scheduled_at=task.scheduled_at,
            cron_expression=task.cron_expression,
            is_recurring=task.is_recurring,
            started_at=task.started_at,
            completed_at=task.completed_at,
            error_message=task.error_message,
            result_metrics=task.result_metrics,
            model_path=task.model_path,
            created_at=task.created_at,
        )

    @router.post(
        "/tasks/{task_id}/cancel",
        response_model=TrainingTaskResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Task not found"},
            409: {"model": ErrorResponse, "description": "Cannot cancel task"},
        },
        summary="Cancel training task",
        description="Cancel a pending or scheduled training task.",
    )
    async def cancel_training_task(
        task_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> TrainingTaskResponse:
        """Cancel a task that is still pending or scheduled.

        Raises:
            HTTPException: 400 for a malformed ID, 404 when the task is not
                found for this token, 409 when the task is in any other state,
                500 when ``db.cancel_training_task`` reports failure.
        """
        _validate_uuid(task_id, "task_id")
        task = _require_owned_task(db, task_id, admin_token)

        if task.status not in ("pending", "scheduled"):
            raise HTTPException(
                status_code=409,
                detail=f"Cannot cancel task with status: {task.status}",
            )

        if not db.cancel_training_task(task_id):
            raise HTTPException(
                status_code=500,
                detail="Failed to cancel training task",
            )

        return TrainingTaskResponse(
            task_id=task_id,
            status=TrainingStatus.CANCELLED,
            message="Training task cancelled successfully",
        )

    @router.get(
        "/tasks/{task_id}/logs",
        response_model=TrainingLogsResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Task not found"},
        },
        summary="Get training logs",
        description="Get training task logs.",
    )
    async def get_training_logs(
        task_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        limit: Annotated[
            int,
            Query(ge=1, le=500, description="Maximum logs to return"),
        ] = 100,
        offset: Annotated[
            int,
            Query(ge=0, description="Offset"),
        ] = 0,
    ) -> TrainingLogsResponse:
        """Return one page of log entries for a training task.

        Raises:
            HTTPException: 400 for a malformed ID, 404 when the task is not
                found for this token.
        """
        _validate_uuid(task_id, "task_id")
        _require_owned_task(db, task_id, admin_token)

        items = [
            TrainingLogItem(
                level=log.level,
                message=log.message,
                details=log.details,
                created_at=log.created_at,
            )
            for log in db.get_training_logs(task_id, limit, offset)
        ]

        return TrainingLogsResponse(
            task_id=task_id,
            logs=items,
        )

    # =========================================================================
    # Phase 4: Training Data Management
    # =========================================================================

    @router.get(
        "/documents",
        response_model=TrainingDocumentsResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Get documents for training",
        description="Get labeled documents available for training with filtering options.",
    )
    async def get_training_documents(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        has_annotations: Annotated[
            bool,
            Query(description="Only include documents with annotations"),
        ] = True,
        min_annotation_count: Annotated[
            int | None,
            Query(ge=1, description="Minimum annotation count"),
        ] = None,
        exclude_used_in_training: Annotated[
            bool,
            Query(description="Exclude documents already used in training"),
        ] = False,
        limit: Annotated[
            int,
            Query(ge=1, le=100, description="Page size"),
        ] = 100,
        offset: Annotated[
            int,
            Query(ge=0, description="Offset"),
        ] = 0,
    ) -> TrainingDocumentsResponse:
        """Return one page of labeled documents with annotation and training info.

        For every document this issues one annotation query and one
        training-link query in addition to the page query.
        """
        documents, total = db.get_documents_for_training(
            admin_token=admin_token,
            status="labeled",
            has_annotations=has_annotations,
            min_annotation_count=min_annotation_count,
            exclude_used_in_training=exclude_used_in_training,
            limit=limit,
            offset=offset,
        )

        items = []
        for doc in documents:
            annotations = db.get_annotations_for_document(str(doc.document_id))

            # Only the two known sources are tallied; any other source value
            # still counts towards annotation_count but not towards a bucket.
            sources = {"manual": 0, "auto": 0}
            for ann in annotations:
                if ann.source in sources:
                    sources[ann.source] += 1

            training_links = db.get_document_training_tasks(doc.document_id)

            items.append(
                TrainingDocumentItem(
                    document_id=str(doc.document_id),
                    filename=doc.filename,
                    annotation_count=len(annotations),
                    annotation_sources=sources,
                    used_in_training=[str(link.task_id) for link in training_links],
                    last_modified=doc.updated_at,
                )
            )

        return TrainingDocumentsResponse(
            total=total,
            limit=limit,
            offset=offset,
            documents=items,
        )

    @router.get(
        "/models/{task_id}/download",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Model not found"},
        },
        summary="Download trained model",
        description="Download trained model weights file.",
    )
    async def download_model(
        task_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ):
        """Return the model weights file of a training task as a download.

        Raises:
            HTTPException: 400 for a malformed ID; 404 when the task is not
                found for this token, has no ``model_path``, or the path is not
                a regular file on disk.
        """
        from fastapi.responses import FileResponse
        from pathlib import Path

        _validate_uuid(task_id, "task_id")
        task = _require_owned_task(db, task_id, admin_token)

        if not task.model_path:
            raise HTTPException(
                status_code=404,
                detail="Model file not available for this task",
            )

        # is_file() rather than exists(): a directory at this path cannot be
        # served as a file and should be reported as missing.
        model_path = Path(task.model_path)
        if not model_path.is_file():
            raise HTTPException(
                status_code=404,
                detail="Model file not found on disk",
            )

        return FileResponse(
            path=str(model_path),
            media_type="application/octet-stream",
            filename=f"{task.name}_model.pt",
        )

    @router.get(
        "/models",
        response_model=TrainingModelsResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Get trained models",
        description="Get list of trained models with metrics and download links.",
    )
    async def get_training_models(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        status: Annotated[
            str | None,
            Query(description="Filter by status (completed, failed, etc.)"),
        ] = None,
        limit: Annotated[
            int,
            Query(ge=1, le=100, description="Page size"),
        ] = 20,
        offset: Annotated[
            int,
            Query(ge=0, description="Offset"),
        ] = 0,
    ) -> TrainingModelsResponse:
        """Return one page of training tasks presented as models.

        Without a ``status`` filter only completed tasks are listed. A download
        URL is included only for completed tasks that have a ``model_path``.
        """
        tasks, total = db.get_training_tasks_by_token(
            admin_token=admin_token,
            status=status if status else "completed",
            limit=limit,
            offset=offset,
        )

        items = []
        for task in tasks:
            download_url = None
            if task.model_path and task.status == "completed":
                download_url = f"/api/v1/admin/training/models/{task.task_id}/download"

            items.append(
                TrainingModelItem(
                    task_id=str(task.task_id),
                    name=task.name,
                    status=TrainingStatus(task.status),
                    document_count=task.document_count,
                    created_at=task.created_at,
                    completed_at=task.completed_at,
                    metrics=ModelMetrics(
                        mAP=task.metrics_mAP,
                        precision=task.metrics_precision,
                        recall=task.metrics_recall,
                    ),
                    model_path=task.model_path,
                    download_url=download_url,
                )
            )

        return TrainingModelsResponse(
            total=total,
            limit=limit,
            offset=offset,
            models=items,
        )

    # =========================================================================
    # Export Endpoints
    # =========================================================================

    @router.post(
        "/export",
        response_model=ExportResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid request"},
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Export annotations",
        description="Export annotations in YOLO format for training.",
    )
    async def export_annotations(
        request: ExportRequest,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> ExportResponse:
        """Write a YOLO dataset (images, label files, data.yaml) to a new export directory.

        Documents are split in list order: the first
        ``int(len(documents) * request.split_ratio)`` go to ``train``, the rest
        to ``val``. Documents without annotations and pages whose image file is
        missing are skipped. ``train_count`` and ``val_count`` in the response
        are the sizes of the two splits before skipping.

        Raises:
            HTTPException: 400 for an unsupported format or when there are no
                labeled documents.
        """
        from datetime import timezone
        from pathlib import Path
        import shutil

        if request.format not in ("yolo", "coco", "voc"):
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported export format: {request.format}",
            )
        if request.format != "yolo":
            # "coco" and "voc" pass validation, but everything below writes the
            # YOLO layout. Make that visible instead of failing the request.
            logger.warning(
                "Export format %r requested; only the YOLO layout is written",
                request.format,
            )

        documents = db.get_labeled_documents_for_export(admin_token)
        if not documents:
            raise HTTPException(
                status_code=400,
                detail="No labeled documents available for export",
            )

        # Timestamped export directory (UTC). datetime.utcnow() is deprecated;
        # an aware "now" in UTC formats to the same string.
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        export_dir = Path("data/exports") / f"export_{timestamp}"
        export_dir.mkdir(parents=True, exist_ok=True)

        # YOLO format directories
        for kind in ("images", "labels"):
            for split in ("train", "val"):
                (export_dir / kind / split).mkdir(parents=True, exist_ok=True)

        # Calculate train/val split
        train_count = int(len(documents) * request.split_ratio)
        train_docs = documents[:train_count]
        val_docs = documents[train_count:]

        total_images = 0
        total_annotations = 0

        for split, docs in (("train", train_docs), ("val", val_docs)):
            for doc in docs:
                annotations = db.get_annotations_for_document(str(doc.document_id))
                if not annotations:
                    continue

                for page_num in range(1, doc.page_count + 1):
                    page_annotations = [a for a in annotations if a.page_number == page_num]

                    # Pages without annotations are exported only when images
                    # were explicitly requested.
                    if not page_annotations and not request.include_images:
                        continue

                    src_image = Path("data/admin_images") / str(doc.document_id) / f"page_{page_num}.png"
                    if not src_image.exists():
                        continue

                    image_name = f"{doc.document_id}_page{page_num}.png"
                    shutil.copy(src_image, export_dir / "images" / split / image_name)
                    total_images += 1

                    # YOLO label line: class_id x_center y_center width height
                    label_lines = [
                        f"{ann.class_id} {ann.x_center:.6f} {ann.y_center:.6f} {ann.width:.6f} {ann.height:.6f}\n"
                        for ann in page_annotations
                    ]
                    label_name = f"{doc.document_id}_page{page_num}.txt"
                    label_path = export_dir / "labels" / split / label_name
                    # Explicit encoding so the output does not depend on the
                    # host locale.
                    label_path.write_text("".join(label_lines), encoding="utf-8")
                    total_annotations += len(label_lines)

        # Create data.yaml
        from src.data.admin_models import FIELD_CLASSES

        yaml_content = (
            "# Auto-generated YOLO dataset config\n"
            f"path: {export_dir.absolute()}\n"
            "train: images/train\n"
            "val: images/val\n"
            "\n"
            f"nc: {len(FIELD_CLASSES)}\n"
            f"names: {list(FIELD_CLASSES.values())}\n"
        )
        (export_dir / "data.yaml").write_text(yaml_content, encoding="utf-8")

        return ExportResponse(
            status="completed",
            export_path=str(export_dir),
            total_images=total_images,
            total_annotations=total_annotations,
            train_count=len(train_docs),
            val_count=len(val_docs),
            message=f"Exported {total_images} images with {total_annotations} annotations",
        )

    return router


# Remainder of the original physical line (patch headers and docstring opening
# of the next files in this dump), kept verbatim:
# diff --git a/src/web/api/v1/batch/__init__.py b/src/web/api/v1/batch/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/web/api/v1/batch/routes.py b/src/web/api/v1/batch/routes.py new file mode 100644 index 0000000..c97819c --- /dev/null +++ b/src/web/api/v1/batch/routes.py @@ -0,0 +1,236 @@ +""" +Batch Upload API Routes + +Endpoints for batch uploading documents via ZIP files with CSV metadata.
+""" + +import io +import logging +import zipfile +from datetime import datetime +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, Form +from fastapi.responses import JSONResponse + +from src.data.admin_db import AdminDB +from src.web.core.auth import validate_admin_token, get_admin_db +from src.web.services.batch_upload import BatchUploadService, MAX_COMPRESSED_SIZE, MAX_UNCOMPRESSED_SIZE +from src.web.workers.batch_queue import BatchTask, get_batch_queue + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/api/v1/admin/batch", tags=["batch-upload"]) + + +@router.post("/upload") +async def upload_batch( + file: UploadFile = File(...), + upload_source: str = Form(default="ui"), + async_mode: bool = Form(default=True), + auto_label: bool = Form(default=True), + admin_token: Annotated[str, Depends(validate_admin_token)] = None, + admin_db: Annotated[AdminDB, Depends(get_admin_db)] = None, +) -> dict: + """Upload a batch of documents via ZIP file. 
+ + The ZIP file can contain: + - Multiple PDF files + - Optional CSV file with field values for auto-labeling + + CSV format: + - Required column: DocumentId (matches PDF filename without extension) + - Optional columns: InvoiceNumber, InvoiceDate, InvoiceDueDate, Amount, + OCR, Bankgiro, Plusgiro, customer_number, supplier_organisation_number + + Args: + file: ZIP file upload + upload_source: Upload source (ui or api) + admin_token: Admin authentication token + admin_db: Admin database interface + + Returns: + Batch upload result with batch_id and status + """ + if not file.filename.lower().endswith('.zip'): + raise HTTPException(status_code=400, detail="Only ZIP files are supported") + + # Check compressed size + if file.size and file.size > MAX_COMPRESSED_SIZE: + max_mb = MAX_COMPRESSED_SIZE / (1024 * 1024) + raise HTTPException( + status_code=400, + detail=f"File size exceeds {max_mb:.0f}MB limit" + ) + + try: + # Read file content + zip_content = await file.read() + + # Additional security validation before processing + try: + with zipfile.ZipFile(io.BytesIO(zip_content)) as test_zip: + # Quick validation of ZIP structure + test_zip.testzip() + except zipfile.BadZipFile: + raise HTTPException(status_code=400, detail="Invalid ZIP file format") + + if async_mode: + # Async mode: Queue task and return immediately + from uuid import uuid4 + + batch_id = uuid4() + + # Create batch task for background processing + task = BatchTask( + batch_id=batch_id, + admin_token=admin_token, + zip_content=zip_content, + zip_filename=file.filename, + upload_source=upload_source, + auto_label=auto_label, + created_at=datetime.utcnow(), + ) + + # Submit to queue + queue = get_batch_queue() + if not queue.submit(task): + raise HTTPException( + status_code=503, + detail="Processing queue is full. Please try again later." 
+ ) + + logger.info( + f"Batch upload queued: batch_id={batch_id}, " + f"filename={file.filename}, async_mode=True" + ) + + # Return 202 Accepted with batch_id and status URL + return JSONResponse( + status_code=202, + content={ + "status": "accepted", + "batch_id": str(batch_id), + "message": "Batch upload queued for processing", + "status_url": f"/api/v1/admin/batch/status/{batch_id}", + "queue_depth": queue.get_queue_depth(), + } + ) + else: + # Sync mode: Process immediately and return results + service = BatchUploadService(admin_db) + result = service.process_zip_upload( + admin_token=admin_token, + zip_filename=file.filename, + zip_content=zip_content, + upload_source=upload_source, + ) + + logger.info( + f"Batch upload completed: batch_id={result.get('batch_id')}, " + f"status={result.get('status')}, files={result.get('successful_files')}" + ) + + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error processing batch upload: {e}", exc_info=True) + raise HTTPException( + status_code=500, + detail="Failed to process batch upload. Please contact support." + ) + + +@router.get("/status/{batch_id}") +async def get_batch_status( + batch_id: str, + admin_token: Annotated[str, Depends(validate_admin_token)] = None, + admin_db: Annotated[AdminDB, Depends(get_admin_db)] = None, +) -> dict: + """Get batch upload status and file processing details. 
+ + Args: + batch_id: Batch upload ID + admin_token: Admin authentication token + admin_db: Admin database interface + + Returns: + Batch status with file processing details + """ + # Validate UUID format + try: + batch_uuid = UUID(batch_id) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid batch ID format") + + # Check batch exists and verify ownership + batch = admin_db.get_batch_upload(batch_uuid) + if not batch: + raise HTTPException(status_code=404, detail="Batch not found") + + # CRITICAL: Verify ownership + if batch.admin_token != admin_token: + raise HTTPException( + status_code=403, + detail="You do not have access to this batch" + ) + + # Now safe to return details + service = BatchUploadService(admin_db) + result = service.get_batch_status(batch_id) + + return result + + +@router.get("/list") +async def list_batch_uploads( + admin_token: Annotated[str, Depends(validate_admin_token)] = None, + admin_db: Annotated[AdminDB, Depends(get_admin_db)] = None, + limit: int = 50, + offset: int = 0, +) -> dict: + """List batch uploads for the current admin token. 
+ + Args: + admin_token: Admin authentication token + admin_db: Admin database interface + limit: Maximum number of results + offset: Offset for pagination + + Returns: + List of batch uploads + """ + # Validate pagination parameters + if limit < 1 or limit > 100: + raise HTTPException(status_code=400, detail="Limit must be between 1 and 100") + if offset < 0: + raise HTTPException(status_code=400, detail="Offset must be non-negative") + + # Get batch uploads filtered by admin token + batches, total = admin_db.get_batch_uploads_by_token( + admin_token=admin_token, + limit=limit, + offset=offset, + ) + + return { + "batches": [ + { + "batch_id": str(b.batch_id), + "filename": b.filename, + "status": b.status, + "total_files": b.total_files, + "successful_files": b.successful_files, + "failed_files": b.failed_files, + "created_at": b.created_at.isoformat() if b.created_at else None, + "completed_at": b.completed_at.isoformat() if b.completed_at else None, + } + for b in batches + ], + "total": total, + "limit": limit, + "offset": offset, + } diff --git a/src/web/api/v1/public/__init__.py b/src/web/api/v1/public/__init__.py new file mode 100644 index 0000000..8776b9b --- /dev/null +++ b/src/web/api/v1/public/__init__.py @@ -0,0 +1,16 @@ +""" +Public API v1 + +Customer-facing endpoints for inference, async processing, and labeling. +""" + +from src.web.api.v1.public.inference import create_inference_router +from src.web.api.v1.public.async_api import create_async_router, set_async_service +from src.web.api.v1.public.labeling import create_labeling_router + +__all__ = [ + "create_inference_router", + "create_async_router", + "set_async_service", + "create_labeling_router", +] diff --git a/src/web/api/v1/public/async_api.py b/src/web/api/v1/public/async_api.py new file mode 100644 index 0000000..6d5e3f2 --- /dev/null +++ b/src/web/api/v1/public/async_api.py @@ -0,0 +1,372 @@ +""" +Async API Routes + +FastAPI endpoints for async invoice processing. 
+""" + +import logging +from pathlib import Path +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, File, HTTPException, Query, UploadFile + +from src.web.dependencies import ( + ApiKeyDep, + AsyncDBDep, + PollRateLimitDep, + SubmitRateLimitDep, +) +from src.web.schemas.inference import ( + AsyncRequestItem, + AsyncRequestsListResponse, + AsyncResultResponse, + AsyncStatus, + AsyncStatusResponse, + AsyncSubmitResponse, + DetectionResult, + InferenceResult, +) +from src.web.schemas.common import ErrorResponse + + +def _validate_request_id(request_id: str) -> None: + """Validate that request_id is a valid UUID format.""" + try: + UUID(request_id) + except ValueError: + raise HTTPException( + status_code=400, + detail="Invalid request ID format. Must be a valid UUID.", + ) + + +logger = logging.getLogger(__name__) + +# Global reference to async processing service (set during app startup) +_async_service = None + + +def set_async_service(service) -> None: + """Set the async processing service instance.""" + global _async_service + _async_service = service + + +def get_async_service(): + """Get the async processing service instance.""" + if _async_service is None: + raise RuntimeError("AsyncProcessingService not initialized") + return _async_service + + +def create_async_router(allowed_extensions: tuple[str, ...]) -> APIRouter: + """Create async API router.""" + router = APIRouter(prefix="/async", tags=["Async Processing"]) + + @router.post( + "/submit", + response_model=AsyncSubmitResponse, + responses={ + 400: {"model": ErrorResponse, "description": "Invalid file"}, + 401: {"model": ErrorResponse, "description": "Invalid API key"}, + 429: {"model": ErrorResponse, "description": "Rate limit exceeded"}, + 503: {"model": ErrorResponse, "description": "Queue full"}, + }, + summary="Submit PDF for async processing", + description="Submit a PDF or image file for asynchronous processing. 
" + "Returns a request_id that can be used to poll for results.", + ) + async def submit_document( + api_key: SubmitRateLimitDep, + file: UploadFile = File(..., description="PDF or image file to process"), + ) -> AsyncSubmitResponse: + """Submit a document for async processing.""" + # Validate filename + if not file.filename: + raise HTTPException(status_code=400, detail="Filename is required") + + # Validate file extension + file_ext = Path(file.filename).suffix.lower() + if file_ext not in allowed_extensions: + raise HTTPException( + status_code=400, + detail=f"Unsupported file type: {file_ext}. " + f"Allowed: {', '.join(allowed_extensions)}", + ) + + # Read file content + try: + content = await file.read() + except Exception as e: + logger.error(f"Failed to read uploaded file: {e}") + raise HTTPException(status_code=400, detail="Failed to read file") + + # Check file size (get from config via service) + service = get_async_service() + max_size = service._async_config.max_file_size_mb * 1024 * 1024 + if len(content) > max_size: + raise HTTPException( + status_code=400, + detail=f"File too large. 
Maximum size: " + f"{service._async_config.max_file_size_mb}MB", + ) + + # Submit request + result = service.submit_request( + api_key=api_key, + file_content=content, + filename=file.filename, + content_type=file.content_type or "application/octet-stream", + ) + + if not result.success: + if "queue" in (result.error or "").lower(): + raise HTTPException(status_code=503, detail=result.error) + raise HTTPException(status_code=500, detail=result.error) + + return AsyncSubmitResponse( + status="accepted", + message="Request submitted for processing", + request_id=result.request_id, + estimated_wait_seconds=result.estimated_wait_seconds, + poll_url=f"/api/v1/async/status/{result.request_id}", + ) + + @router.get( + "/status/{request_id}", + response_model=AsyncStatusResponse, + responses={ + 401: {"model": ErrorResponse, "description": "Invalid API key"}, + 404: {"model": ErrorResponse, "description": "Request not found"}, + 429: {"model": ErrorResponse, "description": "Polling too frequently"}, + }, + summary="Get request status", + description="Get the current processing status of an async request.", + ) + async def get_status( + request_id: str, + api_key: PollRateLimitDep, + db: AsyncDBDep, + ) -> AsyncStatusResponse: + """Get the status of an async request.""" + # Validate UUID format + _validate_request_id(request_id) + + # Get request from database (validates API key ownership) + request = db.get_request_by_api_key(request_id, api_key) + + if request is None: + raise HTTPException( + status_code=404, + detail="Request not found or does not belong to this API key", + ) + + # Get queue position for pending requests + position = None + if request.status == "pending": + position = db.get_queue_position(request_id) + + # Build result URL for completed requests + result_url = None + if request.status == "completed": + result_url = f"/api/v1/async/result/{request_id}" + + return AsyncStatusResponse( + request_id=str(request.request_id), + 
status=AsyncStatus(request.status), + filename=request.filename, + created_at=request.created_at, + started_at=request.started_at, + completed_at=request.completed_at, + position_in_queue=position, + error_message=request.error_message, + result_url=result_url, + ) + + @router.get( + "/result/{request_id}", + response_model=AsyncResultResponse, + responses={ + 401: {"model": ErrorResponse, "description": "Invalid API key"}, + 404: {"model": ErrorResponse, "description": "Request not found"}, + 409: {"model": ErrorResponse, "description": "Request not completed"}, + 429: {"model": ErrorResponse, "description": "Polling too frequently"}, + }, + summary="Get extraction results", + description="Get the extraction results for a completed async request.", + ) + async def get_result( + request_id: str, + api_key: PollRateLimitDep, + db: AsyncDBDep, + ) -> AsyncResultResponse: + """Get the results of a completed async request.""" + # Validate UUID format + _validate_request_id(request_id) + + # Get request from database (validates API key ownership) + request = db.get_request_by_api_key(request_id, api_key) + + if request is None: + raise HTTPException( + status_code=404, + detail="Request not found or does not belong to this API key", + ) + + # Check if completed or failed + if request.status not in ("completed", "failed"): + raise HTTPException( + status_code=409, + detail=f"Request not yet completed. 
Current status: {request.status}", + ) + + # Build inference result from stored data + inference_result = None + if request.result: + # Convert detections to DetectionResult objects + detections = [] + for d in request.result.get("detections", []): + detections.append(DetectionResult( + field=d.get("field", ""), + confidence=d.get("confidence", 0.0), + bbox=d.get("bbox", [0, 0, 0, 0]), + )) + + inference_result = InferenceResult( + document_id=request.result.get("document_id", str(request.request_id)[:8]), + success=request.result.get("success", False), + document_type=request.result.get("document_type", "invoice"), + fields=request.result.get("fields", {}), + confidence=request.result.get("confidence", {}), + detections=detections, + processing_time_ms=request.processing_time_ms or 0.0, + errors=request.result.get("errors", []), + ) + + # Build visualization URL + viz_url = None + if request.visualization_path: + viz_url = f"/api/v1/results/{request.visualization_path}" + + return AsyncResultResponse( + request_id=str(request.request_id), + status=AsyncStatus(request.status), + processing_time_ms=request.processing_time_ms or 0.0, + result=inference_result, + visualization_url=viz_url, + ) + + @router.get( + "/requests", + response_model=AsyncRequestsListResponse, + responses={ + 401: {"model": ErrorResponse, "description": "Invalid API key"}, + }, + summary="List requests", + description="List all async requests for the authenticated API key.", + ) + async def list_requests( + api_key: ApiKeyDep, + db: AsyncDBDep, + status: Annotated[ + str | None, + Query(description="Filter by status (pending, processing, completed, failed)"), + ] = None, + limit: Annotated[ + int, + Query(ge=1, le=100, description="Maximum number of results"), + ] = 20, + offset: Annotated[ + int, + Query(ge=0, description="Pagination offset"), + ] = 0, + ) -> AsyncRequestsListResponse: + """List all requests for the authenticated API key.""" + # Validate status filter + if status and status 
not in ("pending", "processing", "completed", "failed"): + raise HTTPException( + status_code=400, + detail=f"Invalid status filter: {status}. " + "Must be one of: pending, processing, completed, failed", + ) + + # Get requests from database + requests, total = db.get_requests_by_api_key( + api_key=api_key, + status=status, + limit=limit, + offset=offset, + ) + + # Convert to response items + items = [ + AsyncRequestItem( + request_id=str(r.request_id), + status=AsyncStatus(r.status), + filename=r.filename, + file_size=r.file_size, + created_at=r.created_at, + completed_at=r.completed_at, + ) + for r in requests + ] + + return AsyncRequestsListResponse( + total=total, + limit=limit, + offset=offset, + requests=items, + ) + + @router.delete( + "/requests/{request_id}", + responses={ + 401: {"model": ErrorResponse, "description": "Invalid API key"}, + 404: {"model": ErrorResponse, "description": "Request not found"}, + 409: {"model": ErrorResponse, "description": "Cannot delete processing request"}, + }, + summary="Cancel/delete request", + description="Cancel a pending request or delete a completed/failed request.", + ) + async def delete_request( + request_id: str, + api_key: ApiKeyDep, + db: AsyncDBDep, + ) -> dict: + """Delete or cancel an async request.""" + # Validate UUID format + _validate_request_id(request_id) + + # Get request from database + request = db.get_request_by_api_key(request_id, api_key) + + if request is None: + raise HTTPException( + status_code=404, + detail="Request not found or does not belong to this API key", + ) + + # Cannot delete processing requests + if request.status == "processing": + raise HTTPException( + status_code=409, + detail="Cannot delete a request that is currently processing", + ) + + # Delete from database (will cascade delete related records) + conn = db.connect() + with conn.cursor() as cursor: + cursor.execute( + "DELETE FROM async_requests WHERE request_id = %s", + (request_id,), + ) + conn.commit() + + return { + 
"status": "deleted", + "request_id": request_id, + "message": "Request deleted successfully", + } + + return router diff --git a/src/web/routes.py b/src/web/api/v1/public/inference.py similarity index 96% rename from src/web/routes.py rename to src/web/api/v1/public/inference.py index c193bcb..a3d0849 100644 --- a/src/web/routes.py +++ b/src/web/api/v1/public/inference.py @@ -1,5 +1,5 @@ """ -API Routes +Inference API Routes FastAPI route definitions for the inference API. """ @@ -15,23 +15,22 @@ from typing import TYPE_CHECKING from fastapi import APIRouter, File, HTTPException, UploadFile, status from fastapi.responses import FileResponse -from .schemas import ( - BatchInferenceResponse, +from src.web.schemas.inference import ( DetectionResult, - ErrorResponse, HealthResponse, InferenceResponse, InferenceResult, ) +from src.web.schemas.common import ErrorResponse if TYPE_CHECKING: - from .services import InferenceService - from .config import StorageConfig + from src.web.services import InferenceService + from src.web.config import StorageConfig logger = logging.getLogger(__name__) -def create_api_router( +def create_inference_router( inference_service: "InferenceService", storage_config: "StorageConfig", ) -> APIRouter: diff --git a/src/web/api/v1/public/labeling.py b/src/web/api/v1/public/labeling.py new file mode 100644 index 0000000..75e5125 --- /dev/null +++ b/src/web/api/v1/public/labeling.py @@ -0,0 +1,203 @@ +""" +Labeling API Routes + +FastAPI endpoints for pre-labeling documents with expected field values. 
+""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import TYPE_CHECKING + +from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile, status + +from src.data.admin_db import AdminDB +from src.web.schemas.labeling import PreLabelResponse +from src.web.schemas.common import ErrorResponse + +if TYPE_CHECKING: + from src.web.services import InferenceService + from src.web.config import StorageConfig + +logger = logging.getLogger(__name__) + +# Storage directory for pre-label uploads (legacy, now uses storage_config) +PRE_LABEL_UPLOAD_DIR = Path("data/pre_label_uploads") + + +def _convert_pdf_to_images( + document_id: str, content: bytes, page_count: int, images_dir: Path, dpi: int +) -> None: + """Convert PDF pages to images for annotation.""" + import fitz + + doc_images_dir = images_dir / document_id + doc_images_dir.mkdir(parents=True, exist_ok=True) + + pdf_doc = fitz.open(stream=content, filetype="pdf") + + for page_num in range(page_count): + page = pdf_doc[page_num] + mat = fitz.Matrix(dpi / 72, dpi / 72) + pix = page.get_pixmap(matrix=mat) + + image_path = doc_images_dir / f"page_{page_num + 1}.png" + pix.save(str(image_path)) + + pdf_doc.close() + + +def get_admin_db() -> AdminDB: + """Get admin database instance.""" + return AdminDB() + + +def create_labeling_router( + inference_service: "InferenceService", + storage_config: "StorageConfig", +) -> APIRouter: + """ + Create API router with labeling endpoints. 
+ + Args: + inference_service: Inference service instance + storage_config: Storage configuration + + Returns: + Configured APIRouter + """ + router = APIRouter(prefix="/api/v1", tags=["labeling"]) + + # Ensure upload directory exists + PRE_LABEL_UPLOAD_DIR.mkdir(parents=True, exist_ok=True) + + @router.post( + "/pre-label", + response_model=PreLabelResponse, + responses={ + 400: {"model": ErrorResponse, "description": "Invalid file or field values"}, + 500: {"model": ErrorResponse, "description": "Processing error"}, + }, + summary="Pre-label document with expected values", + description="Upload a document with expected field values for pre-labeling. Returns document_id for result retrieval.", + ) + async def pre_label( + file: UploadFile = File(..., description="PDF or image file to process"), + field_values: str = Form( + ..., + description="JSON object with expected field values. " + "Keys: InvoiceNumber, InvoiceDate, InvoiceDueDate, Amount, OCR, " + "Bankgiro, Plusgiro, customer_number, supplier_organisation_number", + ), + db: AdminDB = Depends(get_admin_db), + ) -> PreLabelResponse: + """ + Upload a document with expected field values for pre-labeling. + + Returns document_id which can be used to retrieve results later. 
+ + Example field_values JSON: + ```json + { + "InvoiceNumber": "12345", + "Amount": "1500.00", + "Bankgiro": "123-4567", + "OCR": "1234567890" + } + ``` + """ + # Parse field_values JSON + try: + expected_values = json.loads(field_values) + if not isinstance(expected_values, dict): + raise ValueError("field_values must be a JSON object") + except json.JSONDecodeError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid JSON in field_values: {e}", + ) + + # Validate file extension + if not file.filename: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Filename is required", + ) + + file_ext = Path(file.filename).suffix.lower() + if file_ext not in storage_config.allowed_extensions: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Unsupported file type: {file_ext}. Allowed: {storage_config.allowed_extensions}", + ) + + # Read file content + try: + content = await file.read() + except Exception as e: + logger.error(f"Failed to read uploaded file: {e}") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Failed to read file", + ) + + # Get page count for PDF + page_count = 1 + if file_ext == ".pdf": + try: + import fitz + pdf_doc = fitz.open(stream=content, filetype="pdf") + page_count = len(pdf_doc) + pdf_doc.close() + except Exception as e: + logger.warning(f"Failed to get PDF page count: {e}") + + # Create document record with field_values + document_id = db.create_document( + filename=file.filename, + file_size=len(content), + content_type=file.content_type or "application/octet-stream", + file_path="", # Will update after saving + page_count=page_count, + upload_source="api", + csv_field_values=expected_values, + ) + + # Save file to admin uploads + file_path = storage_config.admin_upload_dir / f"{document_id}{file_ext}" + try: + file_path.write_bytes(content) + except Exception as e: + logger.error(f"Failed to save file: {e}") + raise 
HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Failed to save file", + ) + + # Update file path in database + db.update_document_file_path(document_id, str(file_path)) + + # Convert PDF to images for annotation UI + if file_ext == ".pdf": + try: + _convert_pdf_to_images( + document_id, content, page_count, + storage_config.admin_images_dir, storage_config.dpi + ) + except Exception as e: + logger.error(f"Failed to convert PDF to images: {e}") + + # Trigger auto-labeling + db.update_document_status( + document_id=document_id, + status="auto_labeling", + auto_label_status="pending", + ) + + logger.info(f"Pre-label document {document_id} created with {len(expected_values)} expected fields") + + return PreLabelResponse(document_id=document_id) + + return router diff --git a/src/web/app.py b/src/web/app.py index c93157b..2f46fd8 100644 --- a/src/web/app.py +++ b/src/web/app.py @@ -17,8 +17,39 @@ from fastapi.staticfiles import StaticFiles from fastapi.responses import HTMLResponse from .config import AppConfig, default_config -from .routes import create_api_router -from .services import InferenceService +from src.web.services import InferenceService + +# Public API imports +from src.web.api.v1.public import ( + create_inference_router, + create_async_router, + set_async_service, + create_labeling_router, +) + +# Async processing imports +from src.data.async_request_db import AsyncRequestDB +from src.web.workers.async_queue import AsyncTaskQueue +from src.web.services.async_processing import AsyncProcessingService +from src.web.dependencies import init_dependencies +from src.web.core.rate_limiter import RateLimiter + +# Admin API imports +from src.web.api.v1.admin import ( + create_annotation_router, + create_auth_router, + create_documents_router, + create_locks_router, + create_training_router, +) +from src.web.core.scheduler import start_scheduler, stop_scheduler +from src.web.core.autolabel_scheduler import start_autolabel_scheduler, 
stop_autolabel_scheduler + +# Batch upload imports +from src.web.api.v1.batch.routes import router as batch_upload_router +from src.web.workers.batch_queue import init_batch_queue, shutdown_batch_queue +from src.web.services.batch_upload import BatchUploadService +from src.data.admin_db import AdminDB if TYPE_CHECKING: from collections.abc import AsyncGenerator @@ -44,11 +75,38 @@ def create_app(config: AppConfig | None = None) -> FastAPI: storage_config=config.storage, ) + # Create async processing components + async_db = AsyncRequestDB() + rate_limiter = RateLimiter(async_db) + task_queue = AsyncTaskQueue( + max_size=config.async_processing.queue_max_size, + worker_count=config.async_processing.worker_count, + ) + async_service = AsyncProcessingService( + inference_service=inference_service, + db=async_db, + queue=task_queue, + rate_limiter=rate_limiter, + async_config=config.async_processing, + storage_config=config.storage, + ) + + # Initialize dependencies for FastAPI + init_dependencies(async_db, rate_limiter) + set_async_service(async_service) + @asynccontextmanager async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: """Application lifespan manager.""" logger.info("Starting Invoice Inference API...") + # Initialize database tables + try: + async_db.create_tables() + logger.info("Async database tables ready") + except Exception as e: + logger.error(f"Failed to initialize async database: {e}") + # Initialize inference service on startup try: inference_service.initialize() @@ -57,10 +115,75 @@ def create_app(config: AppConfig | None = None) -> FastAPI: logger.error(f"Failed to initialize inference service: {e}") # Continue anyway - service will retry on first request + # Start async processing service + try: + async_service.start() + logger.info("Async processing service started") + except Exception as e: + logger.error(f"Failed to start async processing: {e}") + + # Start batch upload queue + try: + admin_db = AdminDB() + batch_service = 
BatchUploadService(admin_db) + init_batch_queue(batch_service) + logger.info("Batch upload queue started") + except Exception as e: + logger.error(f"Failed to start batch upload queue: {e}") + + # Start training scheduler + try: + start_scheduler() + logger.info("Training scheduler started") + except Exception as e: + logger.error(f"Failed to start training scheduler: {e}") + + # Start auto-label scheduler + try: + start_autolabel_scheduler() + logger.info("AutoLabel scheduler started") + except Exception as e: + logger.error(f"Failed to start autolabel scheduler: {e}") + yield logger.info("Shutting down Invoice Inference API...") + # Stop auto-label scheduler + try: + stop_autolabel_scheduler() + logger.info("AutoLabel scheduler stopped") + except Exception as e: + logger.error(f"Error stopping autolabel scheduler: {e}") + + # Stop training scheduler + try: + stop_scheduler() + logger.info("Training scheduler stopped") + except Exception as e: + logger.error(f"Error stopping training scheduler: {e}") + + # Stop batch upload queue + try: + shutdown_batch_queue() + logger.info("Batch upload queue stopped") + except Exception as e: + logger.error(f"Error stopping batch upload queue: {e}") + + # Stop async processing service + try: + async_service.stop(timeout=30.0) + logger.info("Async processing service stopped") + except Exception as e: + logger.error(f"Error stopping async service: {e}") + + # Close database connection + try: + async_db.close() + logger.info("Database connection closed") + except Exception as e: + logger.error(f"Error closing database: {e}") + # Create FastAPI app app = FastAPI( title="Invoice Field Extraction API", @@ -106,9 +229,34 @@ def create_app(config: AppConfig | None = None) -> FastAPI: name="results", ) - # Include API routes - api_router = create_api_router(inference_service, config.storage) - app.include_router(api_router) + # Include public API routes + inference_router = create_inference_router(inference_service, config.storage) + 
app.include_router(inference_router) + + async_router = create_async_router(config.storage.allowed_extensions) + app.include_router(async_router, prefix="/api/v1") + + labeling_router = create_labeling_router(inference_service, config.storage) + app.include_router(labeling_router) + + # Include admin API routes + auth_router = create_auth_router() + app.include_router(auth_router, prefix="/api/v1") + + documents_router = create_documents_router(config.storage) + app.include_router(documents_router, prefix="/api/v1") + + locks_router = create_locks_router() + app.include_router(locks_router, prefix="/api/v1") + + annotation_router = create_annotation_router() + app.include_router(annotation_router, prefix="/api/v1") + + training_router = create_training_router() + app.include_router(training_router, prefix="/api/v1") + + # Include batch upload routes + app.include_router(batch_upload_router) # Root endpoint - serve HTML UI @app.get("/", response_class=HTMLResponse) diff --git a/src/web/config.py b/src/web/config.py index 0ef77e0..d3701e6 100644 --- a/src/web/config.py +++ b/src/web/config.py @@ -8,6 +8,8 @@ from dataclasses import dataclass, field from pathlib import Path from typing import Any +from src.config import DEFAULT_DPI, PATHS + @dataclass(frozen=True) class ModelConfig: @@ -16,7 +18,7 @@ class ModelConfig: model_path: Path = Path("runs/train/invoice_fields/weights/best.pt") confidence_threshold: float = 0.5 use_gpu: bool = True - dpi: int = 150 + dpi: int = DEFAULT_DPI @dataclass(frozen=True) @@ -32,19 +34,59 @@ class ServerConfig: @dataclass(frozen=True) class StorageConfig: - """File storage configuration.""" + """File storage configuration. + + Note: admin_upload_dir uses PATHS['pdf_dir'] so uploaded PDFs are stored + directly in raw_pdfs directory. This ensures consistency with CLI autolabel + and avoids storing duplicate files. 
+ """ upload_dir: Path = Path("uploads") result_dir: Path = Path("results") + admin_upload_dir: Path = field(default_factory=lambda: Path(PATHS["pdf_dir"])) + admin_images_dir: Path = Path("data/admin_images") max_file_size_mb: int = 50 allowed_extensions: tuple[str, ...] = (".pdf", ".png", ".jpg", ".jpeg") + dpi: int = DEFAULT_DPI def __post_init__(self) -> None: """Create directories if they don't exist.""" object.__setattr__(self, "upload_dir", Path(self.upload_dir)) object.__setattr__(self, "result_dir", Path(self.result_dir)) + object.__setattr__(self, "admin_upload_dir", Path(self.admin_upload_dir)) + object.__setattr__(self, "admin_images_dir", Path(self.admin_images_dir)) self.upload_dir.mkdir(parents=True, exist_ok=True) self.result_dir.mkdir(parents=True, exist_ok=True) + self.admin_upload_dir.mkdir(parents=True, exist_ok=True) + self.admin_images_dir.mkdir(parents=True, exist_ok=True) + + +@dataclass(frozen=True) +class AsyncConfig: + """Async processing configuration.""" + + # Queue settings + queue_max_size: int = 100 + worker_count: int = 1 + task_timeout_seconds: int = 300 + + # Rate limiting defaults + default_requests_per_minute: int = 10 + default_max_concurrent_jobs: int = 3 + default_min_poll_interval_ms: int = 1000 + + # Storage + result_retention_days: int = 7 + temp_upload_dir: Path = Path("uploads/async") + max_file_size_mb: int = 50 + + # Cleanup + cleanup_interval_hours: int = 1 + + def __post_init__(self) -> None: + """Create directories if they don't exist.""" + object.__setattr__(self, "temp_upload_dir", Path(self.temp_upload_dir)) + self.temp_upload_dir.mkdir(parents=True, exist_ok=True) @dataclass @@ -54,6 +96,7 @@ class AppConfig: model: ModelConfig = field(default_factory=ModelConfig) server: ServerConfig = field(default_factory=ServerConfig) storage: StorageConfig = field(default_factory=StorageConfig) + async_processing: AsyncConfig = field(default_factory=AsyncConfig) @classmethod def from_dict(cls, config_dict: dict[str, Any]) 
-> "AppConfig": @@ -62,6 +105,7 @@ class AppConfig: model=ModelConfig(**config_dict.get("model", {})), server=ServerConfig(**config_dict.get("server", {})), storage=StorageConfig(**config_dict.get("storage", {})), + async_processing=AsyncConfig(**config_dict.get("async_processing", {})), ) diff --git a/src/web/core/__init__.py b/src/web/core/__init__.py new file mode 100644 index 0000000..44c32e1 --- /dev/null +++ b/src/web/core/__init__.py @@ -0,0 +1,28 @@ +""" +Core Components + +Reusable core functionality: authentication, rate limiting, scheduling. +""" + +from src.web.core.auth import validate_admin_token, get_admin_db, AdminTokenDep, AdminDBDep +from src.web.core.rate_limiter import RateLimiter +from src.web.core.scheduler import start_scheduler, stop_scheduler, get_training_scheduler +from src.web.core.autolabel_scheduler import ( + start_autolabel_scheduler, + stop_autolabel_scheduler, + get_autolabel_scheduler, +) + +__all__ = [ + "validate_admin_token", + "get_admin_db", + "AdminTokenDep", + "AdminDBDep", + "RateLimiter", + "start_scheduler", + "stop_scheduler", + "get_training_scheduler", + "start_autolabel_scheduler", + "stop_autolabel_scheduler", + "get_autolabel_scheduler", +] diff --git a/src/web/core/auth.py b/src/web/core/auth.py new file mode 100644 index 0000000..0e23cac --- /dev/null +++ b/src/web/core/auth.py @@ -0,0 +1,60 @@ +""" +Admin Authentication + +FastAPI dependencies for admin token authentication. 
import logging
from typing import Annotated

from fastapi import Depends, Header, HTTPException

from src.data.admin_db import AdminDB
from src.data.database import get_session_context

logger = logging.getLogger(__name__)

# Process-wide AdminDB instance, created lazily by get_admin_db().
_admin_db: AdminDB | None = None


def get_admin_db() -> AdminDB:
    """Return the shared AdminDB instance, creating it on first use."""
    global _admin_db
    if _admin_db is None:
        _admin_db = AdminDB()
    return _admin_db


def reset_admin_db() -> None:
    """Discard the shared AdminDB instance (for testing).

    The next call to get_admin_db() builds a fresh instance.
    """
    global _admin_db
    _admin_db = None


async def validate_admin_token(
    x_admin_token: Annotated[str | None, Header()] = None,
    admin_db: AdminDB = Depends(get_admin_db),
) -> str:
    """Validate the admin token supplied in the X-Admin-Token header.

    Args:
        x_admin_token: Token taken from the request header, if present.
        admin_db: Database used to validate the token and record its use.

    Returns:
        The validated token.

    Raises:
        HTTPException: 401 when the header is missing or the token is
            rejected by ``admin_db.is_valid_admin_token``.
    """
    if not x_admin_token:
        raise HTTPException(
            status_code=401,
            detail="Admin token required. Provide X-Admin-Token header.",
        )

    if not admin_db.is_valid_admin_token(x_admin_token):
        raise HTTPException(
            status_code=401,
            detail="Invalid or expired admin token.",
        )

    # Usage tracking is bookkeeping only: a failure here must not reject a
    # request whose token has already been validated. The token itself is
    # deliberately kept out of the log message.
    try:
        admin_db.update_admin_token_usage(x_admin_token)
    except Exception as exc:
        logger.warning("Failed to update admin token usage: %s", exc)

    return x_admin_token


# Type aliases for dependency injection
AdminTokenDep = Annotated[str, Depends(validate_admin_token)]
AdminDBDep = Annotated[AdminDB, Depends(get_admin_db)]
import logging
import threading
from pathlib import Path

from src.data.admin_db import AdminDB
from src.web.services.db_autolabel import (
    get_pending_autolabel_documents,
    process_document_autolabel,
)

logger = logging.getLogger(__name__)


class AutoLabelScheduler:
    """Polls for documents pending auto-labeling and processes them on a daemon thread."""

    def __init__(
        self,
        check_interval_seconds: int = 10,
        batch_size: int = 5,
        output_dir: Path | None = None,
    ):
        """
        Initialize auto-label scheduler.

        Args:
            check_interval_seconds: Interval to check for pending tasks
            batch_size: Number of documents to process per batch
            output_dir: Output directory for temporary files
                (defaults to ``data/autolabel_output``)
        """
        self._check_interval = check_interval_seconds
        self._batch_size = batch_size
        self._output_dir = output_dir or Path("data/autolabel_output")
        self._running = False
        self._thread: threading.Thread | None = None
        self._stop_event = threading.Event()
        self._db = AdminDB()

    def start(self) -> None:
        """Start the background worker; a no-op if one is already active."""
        if self._running:
            logger.warning("AutoLabel scheduler already running")
            return

        # A worker that outlived stop()'s join timeout is still inside its
        # loop. Clearing the stop event now would revive it next to a new
        # thread, so refuse to start until it has actually exited.
        if self._thread is not None and self._thread.is_alive():
            logger.warning(
                "AutoLabel scheduler worker from a previous run is still "
                "shutting down; not starting another one"
            )
            return

        self._running = True
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()
        logger.info("AutoLabel scheduler started")

    def stop(self) -> None:
        """Request shutdown and wait up to 5 seconds for the worker to exit."""
        if not self._running:
            return

        self._running = False
        self._stop_event.set()

        worker = self._thread
        if worker is not None:
            worker.join(timeout=5)
            if worker.is_alive():
                # Keep the reference so start() can see the lingering worker.
                logger.warning(
                    "AutoLabel scheduler worker did not exit within 5 seconds"
                )
            else:
                self._thread = None

        logger.info("AutoLabel scheduler stopped")

    @property
    def is_running(self) -> bool:
        """Check if scheduler is running."""
        return self._running

    def _run_loop(self) -> None:
        """Worker loop: process one batch, then sleep until the next interval or a stop request."""
        while self._running:
            try:
                self._process_pending_documents()
            except Exception as e:
                logger.error(f"Error in autolabel scheduler loop: {e}", exc_info=True)

            # Event.wait doubles as an interruptible sleep: stop() wakes it early.
            self._stop_event.wait(timeout=self._check_interval)

    def _process_pending_documents(self) -> None:
        """Fetch up to ``batch_size`` pending documents and auto-label each one.

        Failures are logged per document so one bad document does not abort
        the rest of the batch. The batch is abandoned early once a stop has
        been requested.
        """
        try:
            documents = get_pending_autolabel_documents(
                self._db, limit=self._batch_size
            )

            if not documents:
                return

            logger.info(f"Processing {len(documents)} pending autolabel documents")

            for doc in documents:
                if self._stop_event.is_set():
                    break

                try:
                    result = process_document_autolabel(
                        document=doc,
                        db=self._db,
                        output_dir=self._output_dir,
                    )

                    if result.get("success"):
                        logger.info(
                            f"AutoLabel completed for document {doc.document_id}"
                        )
                    else:
                        logger.warning(
                            f"AutoLabel failed for document {doc.document_id}: "
                            f"{result.get('error', 'Unknown error')}"
                        )

                except Exception as e:
                    logger.error(
                        f"Error processing document {doc.document_id}: {e}",
                        exc_info=True,
                    )

        except Exception as e:
            logger.error(f"Error fetching pending documents: {e}", exc_info=True)


# Global scheduler instance
_autolabel_scheduler: AutoLabelScheduler | None = None


def get_autolabel_scheduler() -> AutoLabelScheduler:
    """Return the global auto-label scheduler, creating it on first use."""
    global _autolabel_scheduler
    if _autolabel_scheduler is None:
        _autolabel_scheduler = AutoLabelScheduler()
    return _autolabel_scheduler


def start_autolabel_scheduler() -> None:
    """Start the global auto-label scheduler."""
    scheduler = get_autolabel_scheduler()
    scheduler.start()


def stop_autolabel_scheduler() -> None:
    """Stop the global auto-label scheduler and discard the instance."""
    global _autolabel_scheduler
    if _autolabel_scheduler:
        _autolabel_scheduler.stop()
        _autolabel_scheduler = None
+""" + +import logging +import time +from collections import defaultdict +from dataclasses import dataclass +from datetime import datetime, timedelta +from threading import Lock +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from src.data.async_request_db import AsyncRequestDB + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class RateLimitConfig: + """Rate limit configuration for an API key.""" + + requests_per_minute: int = 10 + max_concurrent_jobs: int = 3 + min_poll_interval_ms: int = 1000 # Minimum time between status polls + + +@dataclass +class RateLimitStatus: + """Current rate limit status.""" + + allowed: bool + remaining_requests: int + reset_at: datetime + retry_after_seconds: int | None = None + reason: str | None = None + + +class RateLimiter: + """ + Thread-safe rate limiter with sliding window algorithm. + + Tracks: + - Requests per minute (sliding window) + - Concurrent active jobs + - Poll frequency per request_id + """ + + def __init__(self, db: "AsyncRequestDB") -> None: + self._db = db + self._lock = Lock() + # In-memory tracking for fast checks + self._request_windows: dict[str, list[float]] = defaultdict(list) + # (api_key, request_id) -> last_poll timestamp + self._poll_timestamps: dict[tuple[str, str], float] = {} + # Cache for API key configs (TTL 60 seconds) + self._config_cache: dict[str, tuple[RateLimitConfig, float]] = {} + self._config_cache_ttl = 60.0 + + def check_submit_limit(self, api_key: str) -> RateLimitStatus: + """Check if API key can submit a new request.""" + config = self._get_config(api_key) + + with self._lock: + now = time.time() + window_start = now - 60 # 1 minute window + + # Clean old entries + self._request_windows[api_key] = [ + ts for ts in self._request_windows[api_key] + if ts > window_start + ] + + current_count = len(self._request_windows[api_key]) + + if current_count >= config.requests_per_minute: + oldest = min(self._request_windows[api_key]) + retry_after = int(oldest + 60 - 
now) + 1 + return RateLimitStatus( + allowed=False, + remaining_requests=0, + reset_at=datetime.utcnow() + timedelta(seconds=retry_after), + retry_after_seconds=max(1, retry_after), + reason="Rate limit exceeded: too many requests per minute", + ) + + # Check concurrent jobs (query database) - inside lock for thread safety + active_jobs = self._db.count_active_jobs(api_key) + if active_jobs >= config.max_concurrent_jobs: + return RateLimitStatus( + allowed=False, + remaining_requests=config.requests_per_minute - current_count, + reset_at=datetime.utcnow() + timedelta(seconds=30), + retry_after_seconds=30, + reason=f"Max concurrent jobs ({config.max_concurrent_jobs}) reached", + ) + + return RateLimitStatus( + allowed=True, + remaining_requests=config.requests_per_minute - current_count - 1, + reset_at=datetime.utcnow() + timedelta(seconds=60), + ) + + def record_request(self, api_key: str) -> None: + """Record a successful request submission.""" + with self._lock: + self._request_windows[api_key].append(time.time()) + + # Also record in database for persistence + try: + self._db.record_rate_limit_event(api_key, "request") + except Exception as e: + logger.warning(f"Failed to record rate limit event: {e}") + + def check_poll_limit(self, api_key: str, request_id: str) -> RateLimitStatus: + """Check if polling is allowed (prevent abuse).""" + config = self._get_config(api_key) + key = (api_key, request_id) + + with self._lock: + now = time.time() + last_poll = self._poll_timestamps.get(key, 0) + elapsed_ms = (now - last_poll) * 1000 + + if elapsed_ms < config.min_poll_interval_ms: + # Suggest exponential backoff + wait_ms = min( + config.min_poll_interval_ms * 2, + 5000, # Max 5 seconds + ) + retry_after = int(wait_ms / 1000) + 1 + return RateLimitStatus( + allowed=False, + remaining_requests=0, + reset_at=datetime.utcnow() + timedelta(milliseconds=wait_ms), + retry_after_seconds=retry_after, + reason="Polling too frequently. 
Please wait before retrying.", + ) + + # Update poll timestamp + self._poll_timestamps[key] = now + + return RateLimitStatus( + allowed=True, + remaining_requests=999, # No limit on poll count, just frequency + reset_at=datetime.utcnow(), + ) + + def _get_config(self, api_key: str) -> RateLimitConfig: + """Get rate limit config for API key with caching.""" + now = time.time() + + # Check cache + if api_key in self._config_cache: + cached_config, cached_at = self._config_cache[api_key] + if now - cached_at < self._config_cache_ttl: + return cached_config + + # Query database + db_config = self._db.get_api_key_config(api_key) + if db_config: + config = RateLimitConfig( + requests_per_minute=db_config.requests_per_minute, + max_concurrent_jobs=db_config.max_concurrent_jobs, + ) + else: + config = RateLimitConfig() # Default limits + + # Cache result + self._config_cache[api_key] = (config, now) + return config + + def cleanup_poll_timestamps(self, max_age_seconds: int = 3600) -> int: + """Clean up old poll timestamps to prevent memory leak.""" + with self._lock: + now = time.time() + cutoff = now - max_age_seconds + old_keys = [ + k for k, v in self._poll_timestamps.items() + if v < cutoff + ] + for key in old_keys: + del self._poll_timestamps[key] + return len(old_keys) + + def cleanup_request_windows(self) -> None: + """Clean up expired entries from request windows.""" + with self._lock: + now = time.time() + window_start = now - 60 + + for api_key in list(self._request_windows.keys()): + self._request_windows[api_key] = [ + ts for ts in self._request_windows[api_key] + if ts > window_start + ] + # Remove empty entries + if not self._request_windows[api_key]: + del self._request_windows[api_key] + + def get_rate_limit_headers(self, status: RateLimitStatus) -> dict[str, str]: + """Generate rate limit headers for HTTP response.""" + headers = { + "X-RateLimit-Remaining": str(status.remaining_requests), + "X-RateLimit-Reset": status.reset_at.isoformat(), + } + if 
"""
Admin Training Scheduler

Background scheduler for training tasks, driven by a polling daemon thread.
"""

import logging
import threading
from datetime import datetime
from pathlib import Path
from typing import Any

from src.data.admin_db import AdminDB

logger = logging.getLogger(__name__)


class TrainingScheduler:
    """Polls the admin database for pending training tasks and runs them one at a time."""

    def __init__(
        self,
        check_interval_seconds: int = 60,
    ):
        """
        Initialize training scheduler.

        Args:
            check_interval_seconds: Interval to check for pending tasks
        """
        self._check_interval = check_interval_seconds
        self._running = False
        self._thread: threading.Thread | None = None
        self._stop_event = threading.Event()
        self._db = AdminDB()

    def start(self) -> None:
        """Start the background worker; a no-op if one is already active."""
        if self._running:
            logger.warning("Training scheduler already running")
            return

        # Training runs inside the worker thread, so a worker can easily
        # outlive stop()'s join timeout. Clearing the stop event now would
        # revive it next to a new thread, so wait until it has exited.
        if self._thread is not None and self._thread.is_alive():
            logger.warning(
                "Training scheduler worker from a previous run is still "
                "shutting down; not starting another one"
            )
            return

        self._running = True
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()
        logger.info("Training scheduler started")

    def stop(self) -> None:
        """Request shutdown and wait up to 5 seconds for the worker to exit."""
        if not self._running:
            return

        self._running = False
        self._stop_event.set()

        worker = self._thread
        if worker is not None:
            worker.join(timeout=5)
            if worker.is_alive():
                # Keep the reference so start() can see the lingering worker.
                logger.warning(
                    "Training scheduler worker did not exit within 5 seconds"
                )
            else:
                self._thread = None

        logger.info("Training scheduler stopped")

    def _run_loop(self) -> None:
        """Worker loop: check for pending tasks, then sleep until the next interval or a stop request."""
        while self._running:
            try:
                self._check_pending_tasks()
            except Exception as e:
                logger.error(f"Error in scheduler loop: {e}", exc_info=True)

            # Event.wait doubles as an interruptible sleep: stop() wakes it early.
            self._stop_event.wait(timeout=self._check_interval)

    def _check_pending_tasks(self) -> None:
        """Run every pending training task whose scheduled time has passed.

        A task that raises is marked ``failed`` with the error message; the
        remaining tasks are still attempted unless a stop has been requested.
        """
        try:
            tasks = self._db.get_pending_training_tasks()

            for task in tasks:
                # Do not begin another (potentially long) training run once
                # shutdown has been requested.
                if self._stop_event.is_set():
                    break

                task_id = str(task.task_id)

                # Check if scheduled time has passed
                if task.scheduled_at and task.scheduled_at > datetime.utcnow():
                    continue

                logger.info(f"Starting training task: {task_id}")

                try:
                    self._execute_task(task_id, task.config or {})
                except Exception as e:
                    logger.error(f"Training task {task_id} failed: {e}")
                    self._db.update_training_task_status(
                        task_id=task_id,
                        status="failed",
                        error_message=str(e),
                    )

        except Exception as e:
            logger.error(f"Error checking pending tasks: {e}", exc_info=True)

    def _execute_task(self, task_id: str, config: dict[str, Any]) -> None:
        """Execute one training task: export data, train, and store the result.

        Args:
            task_id: Identifier of the task being run.
            config: Training options; missing keys fall back to the defaults
                read below.

        Raises:
            ValueError: If the training data could not be exported.
            Exception: Any training failure is logged to the task and re-raised.
        """
        # Update status to running
        self._db.update_training_task_status(task_id, "running")
        self._db.add_training_log(task_id, "INFO", "Training task started")

        try:
            # Get training configuration
            model_name = config.get("model_name", "yolo11n.pt")
            epochs = config.get("epochs", 100)
            batch_size = config.get("batch_size", 16)
            image_size = config.get("image_size", 640)
            learning_rate = config.get("learning_rate", 0.01)
            device = config.get("device", "0")
            project_name = config.get("project_name", "invoice_fields")

            # Export annotations for training
            export_result = self._export_training_data(task_id)
            if not export_result:
                raise ValueError("Failed to export training data")

            data_yaml = export_result["data_yaml"]

            self._db.add_training_log(
                task_id, "INFO",
                f"Exported {export_result['total_images']} images for training",
            )

            # Run YOLO training
            result = self._run_yolo_training(
                task_id=task_id,
                model_name=model_name,
                data_yaml=data_yaml,
                epochs=epochs,
                batch_size=batch_size,
                image_size=image_size,
                learning_rate=learning_rate,
                device=device,
                project_name=project_name,
            )

            # Update task with results
            self._db.update_training_task_status(
                task_id=task_id,
                status="completed",
                result_metrics=result.get("metrics"),
                model_path=result.get("model_path"),
            )
            self._db.add_training_log(task_id, "INFO", "Training completed successfully")

        except Exception as e:
            logger.error(f"Training task {task_id} failed: {e}")
            self._db.add_training_log(task_id, "ERROR", f"Training failed: {e}")
            raise

    def _export_training_data(self, task_id: str) -> dict[str, Any] | None:
        """Export labeled documents as a YOLO dataset under ``data/training/<task_id>``.

        Documents are split 80/20 into train/val in the order returned by
        the database. Page images are copied from ``data/admin_images`` and
        one label file is written per copied page.

        Returns:
            A dict with ``data_yaml``, ``total_images`` and
            ``total_annotations``, or ``None`` when there is nothing to train
            on (no labeled documents, or no page image found on disk).
        """
        import shutil
        from src.data.admin_models import FIELD_CLASSES

        # Get all labeled documents
        documents = self._db.get_labeled_documents_for_export()

        if not documents:
            self._db.add_training_log(task_id, "ERROR", "No labeled documents available")
            return None

        # Create export directory
        export_dir = Path("data/training") / task_id
        export_dir.mkdir(parents=True, exist_ok=True)

        # YOLO format directories
        for kind in ("images", "labels"):
            for split in ("train", "val"):
                (export_dir / kind / split).mkdir(parents=True, exist_ok=True)

        # 80/20 train/val split
        total_docs = len(documents)
        train_count = int(total_docs * 0.8)
        train_docs = documents[:train_count]
        val_docs = documents[train_count:]

        total_images = 0
        total_annotations = 0

        # Export documents
        for split, docs in [("train", train_docs), ("val", val_docs)]:
            for doc in docs:
                annotations = self._db.get_annotations_for_document(str(doc.document_id))

                if not annotations:
                    continue

                for page_num in range(1, doc.page_count + 1):
                    page_annotations = [a for a in annotations if a.page_number == page_num]

                    # Copy image
                    src_image = Path("data/admin_images") / str(doc.document_id) / f"page_{page_num}.png"
                    if not src_image.exists():
                        continue

                    image_name = f"{doc.document_id}_page{page_num}.png"
                    dst_image = export_dir / "images" / split / image_name
                    shutil.copy(src_image, dst_image)
                    total_images += 1

                    # Write YOLO label
                    label_name = f"{doc.document_id}_page{page_num}.txt"
                    label_path = export_dir / "labels" / split / label_name

                    with open(label_path, "w", encoding="utf-8") as f:
                        for ann in page_annotations:
                            line = f"{ann.class_id} {ann.x_center:.6f} {ann.y_center:.6f} {ann.width:.6f} {ann.height:.6f}\n"
                            f.write(line)
                            total_annotations += 1

        # Fail here with a clear message rather than handing an empty
        # dataset to the trainer.
        if total_images == 0:
            self._db.add_training_log(
                task_id, "ERROR",
                "No page images found for the labeled documents",
            )
            return None

        # Create data.yaml
        yaml_path = export_dir / "data.yaml"
        yaml_content = f"""path: {export_dir.absolute()}
train: images/train
val: images/val

nc: {len(FIELD_CLASSES)}
names: {list(FIELD_CLASSES.values())}
"""
        yaml_path.write_text(yaml_content, encoding="utf-8")

        return {
            "data_yaml": str(yaml_path),
            "total_images": total_images,
            "total_annotations": total_annotations,
        }

    def _run_yolo_training(
        self,
        task_id: str,
        model_name: str,
        data_yaml: str,
        epochs: int,
        batch_size: int,
        image_size: int,
        learning_rate: float,
        device: str,
        project_name: str,
    ) -> dict[str, Any]:
        """Train a YOLO model and return its best weights path and metrics.

        Returns:
            A dict with ``model_path`` (``None`` when ``best.pt`` was not
            produced) and ``metrics``.

        Raises:
            ValueError: If the ``ultralytics`` package cannot be imported.
            Exception: Any failure during training is logged to the task and
                re-raised.
        """
        # Only the import itself is guarded by ImportError, so an
        # ImportError raised later during training is not misreported as
        # "Ultralytics not installed".
        try:
            from ultralytics import YOLO
        except ImportError as exc:
            self._db.add_training_log(task_id, "ERROR", "Ultralytics not installed")
            raise ValueError("Ultralytics (YOLO) not installed") from exc

        try:
            # Log training start
            self._db.add_training_log(
                task_id, "INFO",
                f"Starting YOLO training: model={model_name}, epochs={epochs}, batch={batch_size}",
            )

            # Load model
            model = YOLO(model_name)

            # Train
            results = model.train(
                data=data_yaml,
                epochs=epochs,
                batch=batch_size,
                imgsz=image_size,
                lr0=learning_rate,
                device=device,
                project=f"runs/train/{project_name}",
                name=f"task_{task_id[:8]}",
                exist_ok=True,
                verbose=True,
            )

            # Get best model path
            best_model = Path(results.save_dir) / "weights" / "best.pt"

            # Extract metrics
            metrics = {}
            if hasattr(results, "results_dict"):
                metrics = {
                    "mAP50": results.results_dict.get("metrics/mAP50(B)", 0),
                    "mAP50-95": results.results_dict.get("metrics/mAP50-95(B)", 0),
                    "precision": results.results_dict.get("metrics/precision(B)", 0),
                    "recall": results.results_dict.get("metrics/recall(B)", 0),
                }

            self._db.add_training_log(
                task_id, "INFO",
                f"Training completed. mAP@0.5: {metrics.get('mAP50', 'N/A')}",
            )

            return {
                "model_path": str(best_model) if best_model.exists() else None,
                "metrics": metrics,
            }

        except Exception as e:
            self._db.add_training_log(task_id, "ERROR", f"YOLO training failed: {e}")
            raise


# Global scheduler instance
_scheduler: TrainingScheduler | None = None


def get_training_scheduler() -> TrainingScheduler:
    """Return the global training scheduler, creating it on first use."""
    global _scheduler
    if _scheduler is None:
        _scheduler = TrainingScheduler()
    return _scheduler


def start_scheduler() -> None:
    """Start the global training scheduler."""
    scheduler = get_training_scheduler()
    scheduler.start()


def stop_scheduler() -> None:
    """Stop the global training scheduler and discard the instance."""
    global _scheduler
    if _scheduler:
        _scheduler.stop()
        _scheduler = None
import logging
from typing import Annotated

from fastapi import Depends, Header, HTTPException, Request

from src.data.async_request_db import AsyncRequestDB
from src.web.rate_limiter import RateLimiter

logger = logging.getLogger(__name__)

# Process-wide singletons, populated by init_dependencies() at app startup.
_async_db: AsyncRequestDB | None = None
_rate_limiter: RateLimiter | None = None


def init_dependencies(db: AsyncRequestDB, rate_limiter: RateLimiter) -> None:
    """Install the database and rate limiter that the dependencies below hand out."""
    global _async_db, _rate_limiter
    _async_db, _rate_limiter = db, rate_limiter


def get_async_db() -> AsyncRequestDB:
    """Return the installed async request database.

    Raises:
        RuntimeError: If init_dependencies() has not been called yet.
    """
    if _async_db is not None:
        return _async_db
    raise RuntimeError("AsyncRequestDB not initialized")


def get_rate_limiter() -> RateLimiter:
    """Return the installed rate limiter.

    Raises:
        RuntimeError: If init_dependencies() has not been called yet.
    """
    if _rate_limiter is not None:
        return _rate_limiter
    raise RuntimeError("RateLimiter not initialized")


def _unauthorized(detail: str) -> HTTPException:
    """Build the 401 response shared by every API key failure."""
    return HTTPException(
        status_code=401,
        detail=detail,
        headers={"WWW-Authenticate": "API-Key"},
    )


async def verify_api_key(
    x_api_key: Annotated[str | None, Header()] = None,
) -> str:
    """
    Check the X-API-Key header against the database and return the key.

    Usage tracking is best-effort: a failure to record it is logged and
    does not reject the request.

    Raises:
        HTTPException: 401 if API key is missing or invalid
    """
    if not x_api_key:
        raise _unauthorized("X-API-Key header is required")

    store = get_async_db()
    if not store.is_valid_api_key(x_api_key):
        raise _unauthorized("Invalid or inactive API key")

    try:
        store.update_api_key_usage(x_api_key)
    except Exception as e:
        logger.warning(f"Failed to update API key usage: {e}")

    return x_api_key


async def check_submit_rate_limit(
    api_key: Annotated[str, Depends(verify_api_key)],
) -> str:
    """
    Enforce the submit rate limit for an already-verified API key.

    Raises:
        HTTPException: 429 if rate limit exceeded
    """
    limiter = get_rate_limiter()
    verdict = limiter.check_submit_limit(api_key)
    if verdict.allowed:
        return api_key

    raise HTTPException(
        status_code=429,
        detail=verdict.reason or "Rate limit exceeded",
        headers=limiter.get_rate_limit_headers(verdict),
    )


async def check_poll_rate_limit(
    request: Request,
    api_key: Annotated[str, Depends(verify_api_key)],
) -> str:
    """
    Enforce the poll frequency limit for the request_id in the URL path.

    Routes without a ``request_id`` path parameter are not limited.

    Raises:
        HTTPException: 429 if polling too frequently
    """
    request_id = request.path_params.get("request_id")
    if not request_id:
        # Nothing to key the poll limit on.
        return api_key

    limiter = get_rate_limiter()
    verdict = limiter.check_poll_limit(api_key, request_id)
    if verdict.allowed:
        return api_key

    raise HTTPException(
        status_code=429,
        detail=verdict.reason or "Polling too frequently",
        headers=limiter.get_rate_limit_headers(verdict),
    )


# Type aliases for cleaner route signatures
ApiKeyDep = Annotated[str, Depends(verify_api_key)]
SubmitRateLimitDep = Annotated[str, Depends(check_submit_rate_limit)]
PollRateLimitDep = Annotated[str, Depends(check_poll_rate_limit)]
AsyncDBDep = Annotated[AsyncRequestDB, Depends(get_async_db)]
RateLimiterDep = Annotated[RateLimiter, Depends(get_rate_limiter)]
# Backward-compatibility shim.
#
# This module used to carry a byte-for-byte copy of
# src/web/core/rate_limiter.py. Two copies mean two distinct RateLimiter
# classes that can silently drift apart, so the implementation now lives
# only in src.web.core.rate_limiter and is re-exported here for existing
# importers.
from src.web.core.rate_limiter import (
    RateLimitConfig,
    RateLimiter,
    RateLimitStatus,
)

__all__ = [
    "RateLimitConfig",
    "RateLimitStatus",
    "RateLimiter",
]
status.retry_after_seconds: + headers["Retry-After"] = str(status.retry_after_seconds) + return headers diff --git a/src/web/schemas.py b/src/web/schemas.py deleted file mode 100644 index f7ed47d..0000000 --- a/src/web/schemas.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -API Request/Response Schemas - -Pydantic models for API validation and serialization. -""" - -from pydantic import BaseModel, Field -from typing import Any - - -class DetectionResult(BaseModel): - """Single detection result.""" - - field: str = Field(..., description="Field type (e.g., invoice_number, amount)") - confidence: float = Field(..., ge=0, le=1, description="Detection confidence") - bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]") - - -class ExtractedField(BaseModel): - """Extracted and normalized field value.""" - - field_name: str = Field(..., description="Field name") - value: str | None = Field(None, description="Extracted value") - confidence: float = Field(..., ge=0, le=1, description="Extraction confidence") - is_valid: bool = Field(True, description="Whether the value passed validation") - - -class InferenceResult(BaseModel): - """Complete inference result for a document.""" - - document_id: str = Field(..., description="Document identifier") - success: bool = Field(..., description="Whether inference succeeded") - document_type: str = Field( - default="invoice", description="Document type: 'invoice' or 'letter'" - ) - fields: dict[str, str | None] = Field( - default_factory=dict, description="Extracted field values" - ) - confidence: dict[str, float] = Field( - default_factory=dict, description="Confidence scores per field" - ) - detections: list[DetectionResult] = Field( - default_factory=list, description="Raw YOLO detections" - ) - processing_time_ms: float = Field(..., description="Processing time in milliseconds") - visualization_url: str | None = Field( - None, description="URL to visualization image" - ) - errors: list[str] = 
Field(default_factory=list, description="Error messages") - - -class InferenceResponse(BaseModel): - """API response for inference endpoint.""" - - status: str = Field(..., description="Response status: success or error") - message: str = Field(..., description="Response message") - result: InferenceResult | None = Field(None, description="Inference result") - - -class BatchInferenceResponse(BaseModel): - """API response for batch inference endpoint.""" - - status: str = Field(..., description="Response status") - message: str = Field(..., description="Response message") - total: int = Field(..., description="Total documents processed") - successful: int = Field(..., description="Number of successful extractions") - results: list[InferenceResult] = Field( - default_factory=list, description="Individual results" - ) - - -class HealthResponse(BaseModel): - """Health check response.""" - - status: str = Field(..., description="Service status") - model_loaded: bool = Field(..., description="Whether model is loaded") - gpu_available: bool = Field(..., description="Whether GPU is available") - version: str = Field(..., description="API version") - - -class ErrorResponse(BaseModel): - """Error response.""" - - status: str = Field(default="error", description="Error status") - message: str = Field(..., description="Error message") - detail: str | None = Field(None, description="Detailed error information") diff --git a/src/web/schemas/__init__.py b/src/web/schemas/__init__.py new file mode 100644 index 0000000..0cba086 --- /dev/null +++ b/src/web/schemas/__init__.py @@ -0,0 +1,11 @@ +""" +API Schemas + +Pydantic models for request/response validation. 
+""" + +# Import everything from sub-modules for backward compatibility +from src.web.schemas.common import * # noqa: F401, F403 +from src.web.schemas.admin import * # noqa: F401, F403 +from src.web.schemas.inference import * # noqa: F401, F403 +from src.web.schemas.labeling import * # noqa: F401, F403 diff --git a/src/web/schemas/admin.py b/src/web/schemas/admin.py new file mode 100644 index 0000000..4d28eeb --- /dev/null +++ b/src/web/schemas/admin.py @@ -0,0 +1,539 @@ +""" +Admin API Request/Response Schemas + +Pydantic models for admin API validation and serialization. +""" + +from datetime import datetime +from enum import Enum +from typing import Any + +from pydantic import BaseModel, Field + + +# ============================================================================= +# Enums +# ============================================================================= + + +class DocumentStatus(str, Enum): + """Document status enum.""" + + PENDING = "pending" + AUTO_LABELING = "auto_labeling" + LABELED = "labeled" + EXPORTED = "exported" + + +class AutoLabelStatus(str, Enum): + """Auto-labeling status enum.""" + + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + + +class TrainingStatus(str, Enum): + """Training task status enum.""" + + PENDING = "pending" + SCHEDULED = "scheduled" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + CANCELLED = "cancelled" + + +class TrainingType(str, Enum): + """Training task type enum.""" + + TRAIN = "train" + FINETUNE = "finetune" + + +class AnnotationSource(str, Enum): + """Annotation source enum.""" + + MANUAL = "manual" + AUTO = "auto" + IMPORTED = "imported" + + +# ============================================================================= +# Auth Schemas +# ============================================================================= + + +class AdminTokenCreate(BaseModel): + """Request to create an admin token.""" + + name: str = Field(..., min_length=1, max_length=255, 
description="Token name") + expires_in_days: int | None = Field( + None, ge=1, le=365, description="Token expiration in days (optional)" + ) + + +class AdminTokenResponse(BaseModel): + """Response with created admin token.""" + + token: str = Field(..., description="Admin token") + name: str = Field(..., description="Token name") + expires_at: datetime | None = Field(None, description="Token expiration time") + message: str = Field(..., description="Status message") + + +# ============================================================================= +# Document Schemas +# ============================================================================= + + +class DocumentUploadResponse(BaseModel): + """Response for document upload.""" + + document_id: str = Field(..., description="Document UUID") + filename: str = Field(..., description="Original filename") + file_size: int = Field(..., ge=0, description="File size in bytes") + page_count: int = Field(..., ge=1, description="Number of pages") + status: DocumentStatus = Field(..., description="Document status") + auto_label_started: bool = Field( + default=False, description="Whether auto-labeling was started" + ) + message: str = Field(..., description="Status message") + + +class DocumentItem(BaseModel): + """Single document in list.""" + + document_id: str = Field(..., description="Document UUID") + filename: str = Field(..., description="Original filename") + file_size: int = Field(..., ge=0, description="File size in bytes") + page_count: int = Field(..., ge=1, description="Number of pages") + status: DocumentStatus = Field(..., description="Document status") + auto_label_status: AutoLabelStatus | None = Field( + None, description="Auto-labeling status" + ) + annotation_count: int = Field(default=0, ge=0, description="Number of annotations") + upload_source: str = Field(default="ui", description="Upload source (ui or api)") + batch_id: str | None = Field(None, description="Batch ID if uploaded via batch") + 
can_annotate: bool = Field(default=True, description="Whether document can be annotated") + created_at: datetime = Field(..., description="Creation timestamp") + updated_at: datetime = Field(..., description="Last update timestamp") + + +class DocumentListResponse(BaseModel): + """Response for document list.""" + + total: int = Field(..., ge=0, description="Total documents") + limit: int = Field(..., ge=1, description="Page size") + offset: int = Field(..., ge=0, description="Current offset") + documents: list[DocumentItem] = Field( + default_factory=list, description="Document list" + ) + + +class DocumentDetailResponse(BaseModel): + """Response for document detail.""" + + document_id: str = Field(..., description="Document UUID") + filename: str = Field(..., description="Original filename") + file_size: int = Field(..., ge=0, description="File size in bytes") + content_type: str = Field(..., description="MIME type") + page_count: int = Field(..., ge=1, description="Number of pages") + status: DocumentStatus = Field(..., description="Document status") + auto_label_status: AutoLabelStatus | None = Field( + None, description="Auto-labeling status" + ) + auto_label_error: str | None = Field(None, description="Auto-labeling error") + upload_source: str = Field(default="ui", description="Upload source (ui or api)") + batch_id: str | None = Field(None, description="Batch ID if uploaded via batch") + csv_field_values: dict[str, str] | None = Field( + None, description="CSV field values if uploaded via batch" + ) + can_annotate: bool = Field(default=True, description="Whether document can be annotated") + annotation_lock_until: datetime | None = Field( + None, description="Lock expiration time if document is locked" + ) + annotations: list["AnnotationItem"] = Field( + default_factory=list, description="Document annotations" + ) + image_urls: list[str] = Field( + default_factory=list, description="URLs to page images" + ) + training_history: list["TrainingHistoryItem"] = 
Field( + default_factory=list, description="Training tasks that used this document" + ) + created_at: datetime = Field(..., description="Creation timestamp") + updated_at: datetime = Field(..., description="Last update timestamp") + + +class DocumentStatsResponse(BaseModel): + """Document statistics response.""" + + total: int = Field(..., ge=0, description="Total documents") + pending: int = Field(default=0, ge=0, description="Pending documents") + auto_labeling: int = Field(default=0, ge=0, description="Auto-labeling documents") + labeled: int = Field(default=0, ge=0, description="Labeled documents") + exported: int = Field(default=0, ge=0, description="Exported documents") + + +# ============================================================================= +# Annotation Schemas +# ============================================================================= + + +class BoundingBox(BaseModel): + """Bounding box coordinates.""" + + x: int = Field(..., ge=0, description="X coordinate (pixels)") + y: int = Field(..., ge=0, description="Y coordinate (pixels)") + width: int = Field(..., ge=1, description="Width (pixels)") + height: int = Field(..., ge=1, description="Height (pixels)") + + +class AnnotationCreate(BaseModel): + """Request to create an annotation.""" + + page_number: int = Field(default=1, ge=1, description="Page number (1-indexed)") + class_id: int = Field(..., ge=0, le=9, description="Class ID (0-9)") + bbox: BoundingBox = Field(..., description="Bounding box in pixels") + text_value: str | None = Field(None, description="Text value (optional)") + + +class AnnotationUpdate(BaseModel): + """Request to update an annotation.""" + + class_id: int | None = Field(None, ge=0, le=9, description="New class ID") + bbox: BoundingBox | None = Field(None, description="New bounding box") + text_value: str | None = Field(None, description="New text value") + + +class AnnotationItem(BaseModel): + """Single annotation item.""" + + annotation_id: str = Field(..., 
description="Annotation UUID") + page_number: int = Field(..., ge=1, description="Page number") + class_id: int = Field(..., ge=0, le=9, description="Class ID") + class_name: str = Field(..., description="Class name") + bbox: BoundingBox = Field(..., description="Bounding box in pixels") + normalized_bbox: dict[str, float] = Field( + ..., description="Normalized bbox (x_center, y_center, width, height)" + ) + text_value: str | None = Field(None, description="Text value") + confidence: float | None = Field(None, ge=0, le=1, description="Confidence score") + source: AnnotationSource = Field(..., description="Annotation source") + created_at: datetime = Field(..., description="Creation timestamp") + + +class AnnotationResponse(BaseModel): + """Response for annotation operation.""" + + annotation_id: str = Field(..., description="Annotation UUID") + message: str = Field(..., description="Status message") + + +class AnnotationListResponse(BaseModel): + """Response for annotation list.""" + + document_id: str = Field(..., description="Document UUID") + page_count: int = Field(..., ge=1, description="Total pages") + total_annotations: int = Field(..., ge=0, description="Total annotations") + annotations: list[AnnotationItem] = Field( + default_factory=list, description="Annotation list" + ) + + +class AnnotationLockRequest(BaseModel): + """Request to acquire annotation lock.""" + + duration_seconds: int = Field( + default=300, + ge=60, + le=3600, + description="Lock duration in seconds (60-3600)", + ) + + +class AnnotationLockResponse(BaseModel): + """Response for annotation lock operation.""" + + document_id: str = Field(..., description="Document UUID") + locked: bool = Field(..., description="Whether lock was acquired/released") + lock_expires_at: datetime | None = Field( + None, description="Lock expiration time" + ) + message: str = Field(..., description="Status message") + + +class AutoLabelRequest(BaseModel): + """Request to trigger auto-labeling.""" + + 
field_values: dict[str, str] = Field( + ..., + description="Field values to match (e.g., {'invoice_number': '12345'})", + ) + replace_existing: bool = Field( + default=False, description="Replace existing auto annotations" + ) + + +class AutoLabelResponse(BaseModel): + """Response for auto-labeling.""" + + document_id: str = Field(..., description="Document UUID") + status: str = Field(..., description="Auto-labeling status") + annotations_created: int = Field( + default=0, ge=0, description="Number of annotations created" + ) + message: str = Field(..., description="Status message") + + +# ============================================================================= +# Training Schemas +# ============================================================================= + + +class TrainingConfig(BaseModel): + """Training configuration.""" + + model_name: str = Field(default="yolo11n.pt", description="Base model name") + epochs: int = Field(default=100, ge=1, le=1000, description="Training epochs") + batch_size: int = Field(default=16, ge=1, le=128, description="Batch size") + image_size: int = Field(default=640, ge=320, le=1280, description="Image size") + learning_rate: float = Field(default=0.01, gt=0, le=1, description="Learning rate") + device: str = Field(default="0", description="Device (0 for GPU, cpu for CPU)") + project_name: str = Field( + default="invoice_fields", description="Training project name" + ) + + +class TrainingTaskCreate(BaseModel): + """Request to create a training task.""" + + name: str = Field(..., min_length=1, max_length=255, description="Task name") + description: str | None = Field(None, max_length=1000, description="Description") + task_type: TrainingType = Field( + default=TrainingType.TRAIN, description="Task type" + ) + config: TrainingConfig = Field( + default_factory=TrainingConfig, description="Training configuration" + ) + scheduled_at: datetime | None = Field( + None, description="Scheduled execution time" + ) + cron_expression: 
str | None = Field( + None, max_length=50, description="Cron expression for recurring tasks" + ) + + +class TrainingTaskItem(BaseModel): + """Single training task in list.""" + + task_id: str = Field(..., description="Task UUID") + name: str = Field(..., description="Task name") + task_type: TrainingType = Field(..., description="Task type") + status: TrainingStatus = Field(..., description="Task status") + scheduled_at: datetime | None = Field(None, description="Scheduled time") + is_recurring: bool = Field(default=False, description="Is recurring task") + started_at: datetime | None = Field(None, description="Start time") + completed_at: datetime | None = Field(None, description="Completion time") + created_at: datetime = Field(..., description="Creation timestamp") + + +class TrainingTaskListResponse(BaseModel): + """Response for training task list.""" + + total: int = Field(..., ge=0, description="Total tasks") + limit: int = Field(..., ge=1, description="Page size") + offset: int = Field(..., ge=0, description="Current offset") + tasks: list[TrainingTaskItem] = Field(default_factory=list, description="Task list") + + +class TrainingTaskDetailResponse(BaseModel): + """Response for training task detail.""" + + task_id: str = Field(..., description="Task UUID") + name: str = Field(..., description="Task name") + description: str | None = Field(None, description="Description") + task_type: TrainingType = Field(..., description="Task type") + status: TrainingStatus = Field(..., description="Task status") + config: dict[str, Any] | None = Field(None, description="Training configuration") + scheduled_at: datetime | None = Field(None, description="Scheduled time") + cron_expression: str | None = Field(None, description="Cron expression") + is_recurring: bool = Field(default=False, description="Is recurring task") + started_at: datetime | None = Field(None, description="Start time") + completed_at: datetime | None = Field(None, description="Completion time") + 
error_message: str | None = Field(None, description="Error message") + result_metrics: dict[str, Any] | None = Field(None, description="Result metrics") + model_path: str | None = Field(None, description="Trained model path") + created_at: datetime = Field(..., description="Creation timestamp") + + +class TrainingTaskResponse(BaseModel): + """Response for training task operation.""" + + task_id: str = Field(..., description="Task UUID") + status: TrainingStatus = Field(..., description="Task status") + message: str = Field(..., description="Status message") + + +class TrainingLogItem(BaseModel): + """Single training log entry.""" + + level: str = Field(..., description="Log level") + message: str = Field(..., description="Log message") + details: dict[str, Any] | None = Field(None, description="Additional details") + created_at: datetime = Field(..., description="Timestamp") + + +class TrainingLogsResponse(BaseModel): + """Response for training logs.""" + + task_id: str = Field(..., description="Task UUID") + logs: list[TrainingLogItem] = Field(default_factory=list, description="Log entries") + + +# ============================================================================= +# Export Schemas +# ============================================================================= + + +class ExportRequest(BaseModel): + """Request to export annotations.""" + + format: str = Field( + default="yolo", description="Export format (yolo, coco, voc)" + ) + include_images: bool = Field( + default=True, description="Include images in export" + ) + split_ratio: float = Field( + default=0.8, ge=0.5, le=1.0, description="Train/val split ratio" + ) + + +class ExportResponse(BaseModel): + """Response for export operation.""" + + status: str = Field(..., description="Export status") + export_path: str = Field(..., description="Path to exported dataset") + total_images: int = Field(..., ge=0, description="Total images exported") + total_annotations: int = Field(..., ge=0, 
description="Total annotations") + train_count: int = Field(..., ge=0, description="Training set count") + val_count: int = Field(..., ge=0, description="Validation set count") + message: str = Field(..., description="Status message") + + +# ============================================================================= +# Phase 4 & 5: Training Data Management and Annotation Enhancement +# ============================================================================= + + +class TrainingDocumentItem(BaseModel): + """Document item for training page.""" + + document_id: str = Field(..., description="Document UUID") + filename: str = Field(..., description="Filename") + annotation_count: int = Field(..., ge=0, description="Total annotations") + annotation_sources: dict[str, int] = Field( + ..., description="Annotation counts by source (manual, auto)" + ) + used_in_training: list[str] = Field( + default_factory=list, description="List of training task IDs that used this document" + ) + last_modified: datetime = Field(..., description="Last modification time") + + +class TrainingDocumentsResponse(BaseModel): + """Response for GET /admin/training/documents.""" + + total: int = Field(..., ge=0, description="Total document count") + limit: int = Field(..., ge=1, le=100, description="Page size") + offset: int = Field(..., ge=0, description="Pagination offset") + documents: list[TrainingDocumentItem] = Field( + default_factory=list, description="Documents available for training" + ) + + +class ModelMetrics(BaseModel): + """Training model metrics.""" + + mAP: float | None = Field(None, ge=0.0, le=1.0, description="Mean Average Precision") + precision: float | None = Field(None, ge=0.0, le=1.0, description="Precision") + recall: float | None = Field(None, ge=0.0, le=1.0, description="Recall") + + +class TrainingModelItem(BaseModel): + """Trained model item for model list.""" + + task_id: str = Field(..., description="Training task UUID") + name: str = Field(..., description="Model 
name") + status: TrainingStatus = Field(..., description="Training status") + document_count: int = Field(..., ge=0, description="Documents used in training") + created_at: datetime = Field(..., description="Creation timestamp") + completed_at: datetime | None = Field(None, description="Completion timestamp") + metrics: ModelMetrics = Field(..., description="Model metrics") + model_path: str | None = Field(None, description="Path to model weights") + download_url: str | None = Field(None, description="Download URL for model") + + +class TrainingModelsResponse(BaseModel): + """Response for GET /admin/training/models.""" + + total: int = Field(..., ge=0, description="Total model count") + limit: int = Field(..., ge=1, le=100, description="Page size") + offset: int = Field(..., ge=0, description="Pagination offset") + models: list[TrainingModelItem] = Field( + default_factory=list, description="Trained models" + ) + + +class AnnotationVerifyRequest(BaseModel): + """Request to verify an annotation.""" + + pass # No body needed, just POST to verify + + +class AnnotationVerifyResponse(BaseModel): + """Response for annotation verification.""" + + annotation_id: str = Field(..., description="Annotation UUID") + is_verified: bool = Field(..., description="Verification status") + verified_at: datetime = Field(..., description="Verification timestamp") + verified_by: str = Field(..., description="Admin token who verified") + message: str = Field(..., description="Status message") + + +class AnnotationOverrideRequest(BaseModel): + """Request to override an annotation.""" + + bbox: dict[str, int] | None = Field( + None, description="Updated bounding box {x, y, width, height}" + ) + text_value: str | None = Field(None, description="Updated text value") + class_id: int | None = Field(None, ge=0, le=9, description="Updated class ID") + class_name: str | None = Field(None, description="Updated class name") + reason: str | None = Field(None, description="Reason for override") + + 
+class AnnotationOverrideResponse(BaseModel): + """Response for annotation override.""" + + annotation_id: str = Field(..., description="Annotation UUID") + source: str = Field(..., description="New source (manual)") + override_source: str | None = Field(None, description="Original source (auto)") + original_annotation_id: str | None = Field(None, description="Original annotation ID") + message: str = Field(..., description="Status message") + history_id: str = Field(..., description="History record UUID") + + +class TrainingHistoryItem(BaseModel): + """Training history for a document.""" + + task_id: str = Field(..., description="Training task UUID") + name: str = Field(..., description="Training task name") + trained_at: datetime = Field(..., description="Training timestamp") + model_metrics: ModelMetrics | None = Field(None, description="Model metrics") + + +# Forward reference update +DocumentDetailResponse.model_rebuild() diff --git a/src/web/schemas/common.py b/src/web/schemas/common.py new file mode 100644 index 0000000..a25e6ca --- /dev/null +++ b/src/web/schemas/common.py @@ -0,0 +1,15 @@ +""" +Common Schemas + +Shared Pydantic models used across multiple API modules. +""" + +from pydantic import BaseModel, Field + + +class ErrorResponse(BaseModel): + """Error response.""" + + status: str = Field(default="error", description="Error status") + message: str = Field(..., description="Error message") + detail: str | None = Field(None, description="Detailed error information") diff --git a/src/web/schemas/inference.py b/src/web/schemas/inference.py new file mode 100644 index 0000000..2671638 --- /dev/null +++ b/src/web/schemas/inference.py @@ -0,0 +1,196 @@ +""" +API Request/Response Schemas + +Pydantic models for API validation and serialization. 
+""" + +from datetime import datetime +from enum import Enum + +from pydantic import BaseModel, Field + + +# ============================================================================= +# Enums +# ============================================================================= + + +class AsyncStatus(str, Enum): + """Async request status enum.""" + + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + + +# ============================================================================= +# Sync API Schemas (existing) +# ============================================================================= + + +class DetectionResult(BaseModel): + """Single detection result.""" + + field: str = Field(..., description="Field type (e.g., invoice_number, amount)") + confidence: float = Field(..., ge=0, le=1, description="Detection confidence") + bbox: list[float] = Field(..., description="Bounding box [x1, y1, x2, y2]") + + +class ExtractedField(BaseModel): + """Extracted and normalized field value.""" + + field_name: str = Field(..., description="Field name") + value: str | None = Field(None, description="Extracted value") + confidence: float = Field(..., ge=0, le=1, description="Extraction confidence") + is_valid: bool = Field(True, description="Whether the value passed validation") + + +class InferenceResult(BaseModel): + """Complete inference result for a document.""" + + document_id: str = Field(..., description="Document identifier") + success: bool = Field(..., description="Whether inference succeeded") + document_type: str = Field( + default="invoice", description="Document type: 'invoice' or 'letter'" + ) + fields: dict[str, str | None] = Field( + default_factory=dict, description="Extracted field values" + ) + confidence: dict[str, float] = Field( + default_factory=dict, description="Confidence scores per field" + ) + detections: list[DetectionResult] = Field( + default_factory=list, description="Raw YOLO detections" + ) + 
processing_time_ms: float = Field(..., description="Processing time in milliseconds") + visualization_url: str | None = Field( + None, description="URL to visualization image" + ) + errors: list[str] = Field(default_factory=list, description="Error messages") + + +class InferenceResponse(BaseModel): + """API response for inference endpoint.""" + + status: str = Field(..., description="Response status: success or error") + message: str = Field(..., description="Response message") + result: InferenceResult | None = Field(None, description="Inference result") + + +class BatchInferenceResponse(BaseModel): + """API response for batch inference endpoint.""" + + status: str = Field(..., description="Response status") + message: str = Field(..., description="Response message") + total: int = Field(..., description="Total documents processed") + successful: int = Field(..., description="Number of successful extractions") + results: list[InferenceResult] = Field( + default_factory=list, description="Individual results" + ) + + +class HealthResponse(BaseModel): + """Health check response.""" + + status: str = Field(..., description="Service status") + model_loaded: bool = Field(..., description="Whether model is loaded") + gpu_available: bool = Field(..., description="Whether GPU is available") + version: str = Field(..., description="API version") + + +class ErrorResponse(BaseModel): + """Error response.""" + + status: str = Field(default="error", description="Error status") + message: str = Field(..., description="Error message") + detail: str | None = Field(None, description="Detailed error information") + + +# ============================================================================= +# Async API Schemas +# ============================================================================= + + +class AsyncSubmitResponse(BaseModel): + """Response for async submit endpoint.""" + + status: str = Field(default="accepted", description="Response status") + message: str = 
Field(..., description="Response message") + request_id: str = Field(..., description="Unique request identifier (UUID)") + estimated_wait_seconds: int = Field( + ..., ge=0, description="Estimated wait time in seconds" + ) + poll_url: str = Field(..., description="URL to poll for status updates") + + +class AsyncStatusResponse(BaseModel): + """Response for async status endpoint.""" + + request_id: str = Field(..., description="Unique request identifier") + status: AsyncStatus = Field(..., description="Current processing status") + filename: str = Field(..., description="Original filename") + created_at: datetime = Field(..., description="Request creation timestamp") + started_at: datetime | None = Field( + None, description="Processing start timestamp" + ) + completed_at: datetime | None = Field( + None, description="Processing completion timestamp" + ) + position_in_queue: int | None = Field( + None, description="Position in queue (for pending status)" + ) + error_message: str | None = Field( + None, description="Error message (for failed status)" + ) + result_url: str | None = Field( + None, description="URL to fetch results (for completed status)" + ) + + +class AsyncResultResponse(BaseModel): + """Response for async result endpoint.""" + + request_id: str = Field(..., description="Unique request identifier") + status: AsyncStatus = Field(..., description="Processing status") + processing_time_ms: float = Field( + ..., ge=0, description="Total processing time in milliseconds" + ) + result: InferenceResult | None = Field( + None, description="Extraction result (when completed)" + ) + visualization_url: str | None = Field( + None, description="URL to visualization image" + ) + + +class AsyncRequestItem(BaseModel): + """Single item in async requests list.""" + + request_id: str = Field(..., description="Unique request identifier") + status: AsyncStatus = Field(..., description="Current processing status") + filename: str = Field(..., description="Original 
filename") + file_size: int = Field(..., ge=0, description="File size in bytes") + created_at: datetime = Field(..., description="Request creation timestamp") + completed_at: datetime | None = Field( + None, description="Processing completion timestamp" + ) + + +class AsyncRequestsListResponse(BaseModel): + """Response for async requests list endpoint.""" + + total: int = Field(..., ge=0, description="Total number of requests") + limit: int = Field(..., ge=1, description="Maximum items per page") + offset: int = Field(..., ge=0, description="Current offset") + requests: list[AsyncRequestItem] = Field( + default_factory=list, description="List of requests" + ) + + +class RateLimitInfo(BaseModel): + """Rate limit information (included in headers).""" + + limit: int = Field(..., description="Maximum requests per minute") + remaining: int = Field(..., description="Remaining requests in current window") + reset_at: datetime = Field(..., description="Time when limit resets") diff --git a/src/web/schemas/labeling.py b/src/web/schemas/labeling.py new file mode 100644 index 0000000..56ebd0c --- /dev/null +++ b/src/web/schemas/labeling.py @@ -0,0 +1,13 @@ +""" +Labeling API Schemas + +Pydantic models for pre-labeling and label validation endpoints. +""" + +from pydantic import BaseModel, Field + + +class PreLabelResponse(BaseModel): + """API response for pre-label endpoint.""" + + document_id: str = Field(..., description="Document identifier for retrieving results") diff --git a/src/web/services/__init__.py b/src/web/services/__init__.py new file mode 100644 index 0000000..e20189a --- /dev/null +++ b/src/web/services/__init__.py @@ -0,0 +1,18 @@ +""" +Business Logic Services + +Service layer for processing requests and orchestrating data operations. 
+""" + +from src.web.services.autolabel import AutoLabelService, get_auto_label_service +from src.web.services.inference import InferenceService +from src.web.services.async_processing import AsyncProcessingService +from src.web.services.batch_upload import BatchUploadService + +__all__ = [ + "AutoLabelService", + "get_auto_label_service", + "InferenceService", + "AsyncProcessingService", + "BatchUploadService", +] diff --git a/src/web/services/async_processing.py b/src/web/services/async_processing.py new file mode 100644 index 0000000..54e2e08 --- /dev/null +++ b/src/web/services/async_processing.py @@ -0,0 +1,383 @@ +""" +Async Processing Service + +Manages async request lifecycle and background processing. +""" + +import logging +import shutil +import time +import uuid +from dataclasses import dataclass +from datetime import datetime, timedelta +from pathlib import Path +from threading import Event, Thread +from typing import TYPE_CHECKING + +from src.data.async_request_db import AsyncRequestDB +from src.web.workers.async_queue import AsyncTask, AsyncTaskQueue +from src.web.core.rate_limiter import RateLimiter + +if TYPE_CHECKING: + from src.web.config import AsyncConfig, StorageConfig + from src.web.services.inference import InferenceService + +logger = logging.getLogger(__name__) + + +@dataclass +class AsyncSubmitResult: + """Result from async submit operation.""" + + success: bool + request_id: str | None = None + estimated_wait_seconds: int = 0 + error: str | None = None + + +class AsyncProcessingService: + """ + Manages async request lifecycle and processing. 
+ + Coordinates between: + - HTTP endpoints (submit/status/result) + - Background task queue + - Database storage + - Rate limiting + """ + + def __init__( + self, + inference_service: "InferenceService", + db: AsyncRequestDB, + queue: AsyncTaskQueue, + rate_limiter: RateLimiter, + async_config: "AsyncConfig", + storage_config: "StorageConfig", + ) -> None: + self._inference = inference_service + self._db = db + self._queue = queue + self._rate_limiter = rate_limiter + self._async_config = async_config + self._storage_config = storage_config + + # Cleanup thread + self._cleanup_stop_event = Event() + self._cleanup_thread: Thread | None = None + + def start(self) -> None: + """Start the async processing service.""" + # Start the task queue with our handler + self._queue.start(self._process_task) + + # Start cleanup thread + self._cleanup_stop_event.clear() + self._cleanup_thread = Thread( + target=self._cleanup_loop, + name="async-cleanup", + daemon=True, + ) + self._cleanup_thread.start() + logger.info("AsyncProcessingService started") + + def stop(self, timeout: float = 30.0) -> None: + """Stop the async processing service.""" + # Stop cleanup thread + self._cleanup_stop_event.set() + if self._cleanup_thread and self._cleanup_thread.is_alive(): + self._cleanup_thread.join(timeout=5.0) + + # Stop task queue + self._queue.stop(timeout=timeout) + logger.info("AsyncProcessingService stopped") + + def submit_request( + self, + api_key: str, + file_content: bytes, + filename: str, + content_type: str, + ) -> AsyncSubmitResult: + """ + Submit a new async processing request. 
+ + Args: + api_key: API key for the request + file_content: File content as bytes + filename: Original filename + content_type: File content type + + Returns: + AsyncSubmitResult with request_id and status + """ + # Generate request ID + request_id = str(uuid.uuid4()) + + # Save file to temp storage + file_path = self._save_upload(request_id, filename, file_content) + file_size = len(file_content) + + try: + # Calculate expiration + expires_at = datetime.utcnow() + timedelta( + days=self._async_config.result_retention_days + ) + + # Create database record + self._db.create_request( + api_key=api_key, + filename=filename, + file_size=file_size, + content_type=content_type, + expires_at=expires_at, + request_id=request_id, + ) + + # Record rate limit event + self._rate_limiter.record_request(api_key) + + # Create and queue task + task = AsyncTask( + request_id=request_id, + api_key=api_key, + file_path=file_path, + filename=filename, + created_at=datetime.utcnow(), + ) + + if not self._queue.submit(task): + # Queue is full + self._db.update_status( + request_id, + "failed", + error_message="Processing queue is full", + ) + # Cleanup file + file_path.unlink(missing_ok=True) + return AsyncSubmitResult( + success=False, + request_id=request_id, + error="Processing queue is full. Please try again later.", + ) + + # Estimate wait time + estimated_wait = self._estimate_wait() + + return AsyncSubmitResult( + success=True, + request_id=request_id, + estimated_wait_seconds=estimated_wait, + ) + + except Exception as e: + logger.error(f"Failed to submit request: {e}", exc_info=True) + # Cleanup file on error + file_path.unlink(missing_ok=True) + return AsyncSubmitResult( + success=False, + # Return generic error message to avoid leaking implementation details + error="Failed to process request. 
Please try again later.", + ) + + # Allowed file extensions whitelist + ALLOWED_EXTENSIONS = frozenset({".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif"}) + + def _save_upload( + self, + request_id: str, + filename: str, + content: bytes, + ) -> Path: + """Save uploaded file to temp storage.""" + import re + + # Extract extension from filename + ext = Path(filename).suffix.lower() + + # Validate extension: must be alphanumeric only (e.g., .pdf, .png) + if not ext or not re.match(r'^\.[a-z0-9]+$', ext): + ext = ".pdf" + + # Validate against whitelist + if ext not in self.ALLOWED_EXTENSIONS: + ext = ".pdf" + + # Create async upload directory + upload_dir = self._async_config.temp_upload_dir + upload_dir.mkdir(parents=True, exist_ok=True) + + # Build file path - request_id is a UUID so it's safe + file_path = upload_dir / f"{request_id}{ext}" + + # Defense in depth: ensure path is within upload_dir + if not file_path.resolve().is_relative_to(upload_dir.resolve()): + raise ValueError("Invalid file path detected") + + file_path.write_bytes(content) + + return file_path + + def _process_task(self, task: AsyncTask) -> None: + """ + Process a single task (called by worker thread). + + This method is called by the AsyncTaskQueue worker threads. 
+ """ + start_time = time.time() + + try: + # Update status to processing + self._db.update_status(task.request_id, "processing") + + # Ensure file exists + if not task.file_path.exists(): + raise FileNotFoundError(f"Upload file not found: {task.file_path}") + + # Run inference based on file type + file_ext = task.file_path.suffix.lower() + if file_ext == ".pdf": + result = self._inference.process_pdf( + task.file_path, + document_id=task.request_id[:8], + ) + else: + result = self._inference.process_image( + task.file_path, + document_id=task.request_id[:8], + ) + + # Calculate processing time + processing_time_ms = (time.time() - start_time) * 1000 + + # Prepare result for storage + result_data = { + "document_id": result.document_id, + "success": result.success, + "document_type": result.document_type, + "fields": result.fields, + "confidence": result.confidence, + "detections": result.detections, + "errors": result.errors, + } + + # Get visualization path as string + viz_path = None + if result.visualization_path: + viz_path = str(result.visualization_path.name) + + # Store result in database + self._db.complete_request( + request_id=task.request_id, + document_id=result.document_id, + result=result_data, + processing_time_ms=processing_time_ms, + visualization_path=viz_path, + ) + + logger.info( + f"Task {task.request_id} completed successfully " + f"in {processing_time_ms:.0f}ms" + ) + + except Exception as e: + logger.error( + f"Task {task.request_id} failed: {e}", + exc_info=True, + ) + self._db.update_status( + task.request_id, + "failed", + error_message=str(e), + increment_retry=True, + ) + + finally: + # Cleanup uploaded file + if task.file_path.exists(): + task.file_path.unlink(missing_ok=True) + + def _estimate_wait(self) -> int: + """Estimate wait time based on queue depth.""" + queue_depth = self._queue.get_queue_depth() + processing_count = self._queue.get_processing_count() + total_pending = queue_depth + processing_count + + # Estimate ~5 seconds 
per document + avg_processing_time = 5 + return total_pending * avg_processing_time + + def _cleanup_loop(self) -> None: + """Background cleanup loop.""" + logger.info("Cleanup thread started") + cleanup_interval = self._async_config.cleanup_interval_hours * 3600 + + while not self._cleanup_stop_event.wait(timeout=cleanup_interval): + try: + self._run_cleanup() + except Exception as e: + logger.error(f"Cleanup failed: {e}", exc_info=True) + + logger.info("Cleanup thread stopped") + + def _run_cleanup(self) -> None: + """Run cleanup operations.""" + logger.info("Running cleanup...") + + # Delete expired requests + deleted_requests = self._db.delete_expired_requests() + + # Reset stale processing requests + reset_count = self._db.reset_stale_processing_requests( + stale_minutes=self._async_config.task_timeout_seconds // 60, + max_retries=3, + ) + + # Cleanup old rate limit events + deleted_events = self._db.cleanup_old_rate_limit_events(hours=1) + + # Cleanup old poll timestamps + cleaned_polls = self._rate_limiter.cleanup_poll_timestamps() + + # Cleanup rate limiter request windows + self._rate_limiter.cleanup_request_windows() + + # Cleanup orphaned upload files + orphan_count = self._cleanup_orphan_files() + + logger.info( + f"Cleanup complete: {deleted_requests} expired requests, " + f"{reset_count} stale requests reset, " + f"{deleted_events} rate limit events, " + f"{cleaned_polls} poll timestamps, " + f"{orphan_count} orphan files" + ) + + def _cleanup_orphan_files(self) -> int: + """Clean up upload files that don't have matching requests.""" + upload_dir = self._async_config.temp_upload_dir + if not upload_dir.exists(): + return 0 + + count = 0 + # Files older than 1 hour without matching request are considered orphans + cutoff = time.time() - 3600 + + for file_path in upload_dir.iterdir(): + if not file_path.is_file(): + continue + + # Check if file is old enough + if file_path.stat().st_mtime > cutoff: + continue + + # Extract request_id from filename + 
request_id = file_path.stem + + # Check if request exists in database + request = self._db.get_request(request_id) + if request is None: + file_path.unlink(missing_ok=True) + count += 1 + + return count diff --git a/src/web/services/autolabel.py b/src/web/services/autolabel.py new file mode 100644 index 0000000..2d50380 --- /dev/null +++ b/src/web/services/autolabel.py @@ -0,0 +1,335 @@ +""" +Admin Auto-Labeling Service + +Uses FieldMatcher to automatically create annotations from field values. +""" + +import logging +from pathlib import Path +from typing import Any + +import numpy as np +from PIL import Image + +from src.config import DEFAULT_DPI +from src.data.admin_db import AdminDB +from src.data.admin_models import FIELD_CLASS_IDS, FIELD_CLASSES +from src.matcher.field_matcher import FieldMatcher +from src.ocr.paddle_ocr import OCREngine, OCRToken + +logger = logging.getLogger(__name__) + + +class AutoLabelService: + """Service for automatic document labeling using field matching.""" + + def __init__(self, ocr_engine: OCREngine | None = None): + """ + Initialize auto-label service. + + Args: + ocr_engine: OCR engine instance (creates one if not provided) + """ + self._ocr_engine = ocr_engine + self._field_matcher = FieldMatcher() + + @property + def ocr_engine(self) -> OCREngine: + """Lazy initialization of OCR engine.""" + if self._ocr_engine is None: + self._ocr_engine = OCREngine(lang="en") + return self._ocr_engine + + def auto_label_document( + self, + document_id: str, + file_path: str, + field_values: dict[str, str], + db: AdminDB, + replace_existing: bool = False, + skip_lock_check: bool = False, + ) -> dict[str, Any]: + """ + Auto-label a document using field matching. 
+ + Args: + document_id: Document UUID + file_path: Path to document file + field_values: Dict of field_name -> value to match + db: Admin database instance + replace_existing: Whether to replace existing auto annotations + skip_lock_check: Skip annotation lock check (for batch processing) + + Returns: + Dict with status and annotation count + """ + try: + # Get document info first + document = db.get_document(document_id) + if document is None: + raise ValueError(f"Document not found: {document_id}") + + # Check annotation lock unless explicitly skipped + if not skip_lock_check: + from datetime import datetime, timezone + if hasattr(document, 'annotation_lock_until') and document.annotation_lock_until: + if document.annotation_lock_until > datetime.now(timezone.utc): + raise ValueError( + f"Document is locked for annotation until {document.annotation_lock_until}. " + "Auto-labeling skipped." + ) + + # Update status to running + db.update_document_status( + document_id=document_id, + status="auto_labeling", + auto_label_status="running", + ) + + # Delete existing auto annotations if requested + if replace_existing: + deleted = db.delete_annotations_for_document( + document_id=document_id, + source="auto", + ) + logger.info(f"Deleted {deleted} existing auto annotations") + + # Process document + path = Path(file_path) + annotations_created = 0 + + if path.suffix.lower() == ".pdf": + # Process PDF (all pages) + annotations_created = self._process_pdf( + document_id, path, field_values, db + ) + else: + # Process single image + annotations_created = self._process_image( + document_id, path, field_values, db, page_number=1 + ) + + # Update document status + status = "labeled" if annotations_created > 0 else "pending" + db.update_document_status( + document_id=document_id, + status=status, + auto_label_status="completed", + ) + + return { + "status": "completed", + "annotations_created": annotations_created, + } + + except Exception as e: + logger.error(f"Auto-labeling 
failed for {document_id}: {e}") + db.update_document_status( + document_id=document_id, + status="pending", + auto_label_status="failed", + auto_label_error=str(e), + ) + return { + "status": "failed", + "error": str(e), + "annotations_created": 0, + } + + def _process_pdf( + self, + document_id: str, + pdf_path: Path, + field_values: dict[str, str], + db: AdminDB, + ) -> int: + """Process PDF document and create annotations.""" + from src.pdf.renderer import render_pdf_to_images + import io + + total_annotations = 0 + + for page_no, image_bytes in render_pdf_to_images(pdf_path, dpi=DEFAULT_DPI): + # Convert to numpy array + image = Image.open(io.BytesIO(image_bytes)) + image_array = np.array(image) + + # Extract tokens + tokens = self.ocr_engine.extract_from_image( + image_array, + page_no=page_no, + ) + + # Find matches + annotations = self._find_annotations( + document_id, + tokens, + field_values, + page_number=page_no + 1, # 1-indexed + image_width=image_array.shape[1], + image_height=image_array.shape[0], + ) + + # Save annotations + if annotations: + db.create_annotations_batch(annotations) + total_annotations += len(annotations) + + return total_annotations + + def _process_image( + self, + document_id: str, + image_path: Path, + field_values: dict[str, str], + db: AdminDB, + page_number: int = 1, + ) -> int: + """Process single image and create annotations.""" + # Load image + image = Image.open(image_path) + image_array = np.array(image) + + # Extract tokens + tokens = self.ocr_engine.extract_from_image( + image_array, + page_no=0, + ) + + # Find matches + annotations = self._find_annotations( + document_id, + tokens, + field_values, + page_number=page_number, + image_width=image_array.shape[1], + image_height=image_array.shape[0], + ) + + # Save annotations + if annotations: + db.create_annotations_batch(annotations) + + return len(annotations) + + def _find_annotations( + self, + document_id: str, + tokens: list[OCRToken], + field_values: dict[str, 
str], + page_number: int, + image_width: int, + image_height: int, + ) -> list[dict[str, Any]]: + """Find annotations for field values using token matching.""" + from src.normalize import normalize_field + + annotations = [] + + for field_name, value in field_values.items(): + if not value or not value.strip(): + continue + + # Map field name to class ID + class_id = self._get_class_id(field_name) + if class_id is None: + logger.warning(f"Unknown field name: {field_name}") + continue + + class_name = FIELD_CLASSES[class_id] + + # Normalize value + try: + normalized_values = normalize_field(field_name, value) + except Exception as e: + logger.warning(f"Failed to normalize {field_name}={value}: {e}") + normalized_values = [value] + + # Find matches + matches = self._field_matcher.find_matches( + tokens=tokens, + field_name=field_name, + normalized_values=normalized_values, + page_no=page_number - 1, # 0-indexed for matcher + ) + + # Take best match + if matches: + best_match = matches[0] + bbox = best_match.bbox # (x0, y0, x1, y1) + + # Calculate normalized coordinates (YOLO format) + x_center = (bbox[0] + bbox[2]) / 2 / image_width + y_center = (bbox[1] + bbox[3]) / 2 / image_height + width = (bbox[2] - bbox[0]) / image_width + height = (bbox[3] - bbox[1]) / image_height + + # Pixel coordinates + bbox_x = int(bbox[0]) + bbox_y = int(bbox[1]) + bbox_width = int(bbox[2] - bbox[0]) + bbox_height = int(bbox[3] - bbox[1]) + + annotations.append({ + "document_id": document_id, + "page_number": page_number, + "class_id": class_id, + "class_name": class_name, + "x_center": x_center, + "y_center": y_center, + "width": width, + "height": height, + "bbox_x": bbox_x, + "bbox_y": bbox_y, + "bbox_width": bbox_width, + "bbox_height": bbox_height, + "text_value": best_match.matched_value, + "confidence": best_match.score, + "source": "auto", + }) + + return annotations + + def _get_class_id(self, field_name: str) -> int | None: + """Map field name to class ID.""" + # Direct match + 
if field_name in FIELD_CLASS_IDS: + return FIELD_CLASS_IDS[field_name] + + # Handle alternative names + name_mapping = { + "InvoiceNumber": "invoice_number", + "InvoiceDate": "invoice_date", + "InvoiceDueDate": "invoice_due_date", + "OCR": "ocr_number", + "Bankgiro": "bankgiro", + "Plusgiro": "plusgiro", + "Amount": "amount", + "supplier_organisation_number": "supplier_organisation_number", + "PaymentLine": "payment_line", + "customer_number": "customer_number", + } + + mapped_name = name_mapping.get(field_name) + if mapped_name and mapped_name in FIELD_CLASS_IDS: + return FIELD_CLASS_IDS[mapped_name] + + return None + + +# Global service instance +_auto_label_service: AutoLabelService | None = None + + +def get_auto_label_service() -> AutoLabelService: + """Get the auto-label service instance.""" + global _auto_label_service + if _auto_label_service is None: + _auto_label_service = AutoLabelService() + return _auto_label_service + + +def reset_auto_label_service() -> None: + """Reset the auto-label service (for testing).""" + global _auto_label_service + _auto_label_service = None diff --git a/src/web/services/batch_upload.py b/src/web/services/batch_upload.py new file mode 100644 index 0000000..db15e3f --- /dev/null +++ b/src/web/services/batch_upload.py @@ -0,0 +1,548 @@ +""" +Batch Upload Service + +Handles ZIP file uploads with multiple PDFs and optional CSV for auto-labeling. 
+""" + +import csv +import io +import logging +import zipfile +from datetime import datetime +from pathlib import Path +from typing import Any +from uuid import UUID + +from pydantic import BaseModel, Field, field_validator + +from src.data.admin_db import AdminDB +from src.data.admin_models import CSV_TO_CLASS_MAPPING + +logger = logging.getLogger(__name__) + +# Security limits +MAX_COMPRESSED_SIZE = 100 * 1024 * 1024 # 100 MB +MAX_UNCOMPRESSED_SIZE = 200 * 1024 * 1024 # 200 MB +MAX_INDIVIDUAL_FILE_SIZE = 50 * 1024 * 1024 # 50 MB +MAX_FILES_IN_ZIP = 1000 + + +class CSVRowData(BaseModel): + """Validated CSV row data with security checks.""" + + document_id: str = Field(..., min_length=1, max_length=255, pattern=r'^[a-zA-Z0-9\-_\.]+$') + invoice_number: str | None = Field(None, max_length=255) + invoice_date: str | None = Field(None, max_length=50) + invoice_due_date: str | None = Field(None, max_length=50) + amount: str | None = Field(None, max_length=100) + ocr: str | None = Field(None, max_length=100) + bankgiro: str | None = Field(None, max_length=50) + plusgiro: str | None = Field(None, max_length=50) + customer_number: str | None = Field(None, max_length=255) + supplier_organisation_number: str | None = Field(None, max_length=50) + + @field_validator('*', mode='before') + @classmethod + def strip_whitespace(cls, v): + """Strip whitespace from all string fields.""" + if isinstance(v, str): + return v.strip() + return v + + @field_validator('*', mode='before') + @classmethod + def reject_suspicious_patterns(cls, v): + """Reject values with suspicious characters.""" + if isinstance(v, str): + # Reject SQL/shell metacharacters and newlines + dangerous_chars = [';', '|', '&', '`', '$', '\n', '\r', '\x00'] + if any(char in v for char in dangerous_chars): + raise ValueError(f"Suspicious characters detected in value") + return v + + +class BatchUploadService: + """Service for handling batch uploads of documents via ZIP files.""" + + def __init__(self, admin_db: 
AdminDB): + """Initialize the batch upload service. + + Args: + admin_db: Admin database interface + """ + self.admin_db = admin_db + + def _safe_extract_filename(self, zip_path: str) -> str: + """Safely extract filename from ZIP path, preventing path traversal. + + Args: + zip_path: Path from ZIP file entry + + Returns: + Safe filename + + Raises: + ValueError: If path contains traversal attempts or is invalid + """ + # Reject absolute paths + if zip_path.startswith('/') or zip_path.startswith('\\'): + raise ValueError(f"Absolute path rejected: {zip_path}") + + # Reject path traversal attempts + if '..' in zip_path: + raise ValueError(f"Path traversal rejected: {zip_path}") + + # Reject Windows drive letters + if len(zip_path) >= 2 and zip_path[1] == ':': + raise ValueError(f"Windows path rejected: {zip_path}") + + # Get only the basename + safe_name = Path(zip_path).name + if not safe_name or safe_name in ['.', '..']: + raise ValueError(f"Invalid filename: {zip_path}") + + # Validate filename doesn't contain suspicious characters + if any(char in safe_name for char in ['\\', '/', '\x00', '\n', '\r']): + raise ValueError(f"Invalid characters in filename: {safe_name}") + + return safe_name + + def _validate_zip_safety(self, zip_file: zipfile.ZipFile) -> None: + """Validate ZIP file against Zip bomb and other attacks. 
+ + Args: + zip_file: Opened ZIP file + + Raises: + ValueError: If ZIP file is unsafe + """ + total_uncompressed = 0 + file_count = 0 + + for zip_info in zip_file.infolist(): + file_count += 1 + + # Check file count limit + if file_count > MAX_FILES_IN_ZIP: + raise ValueError( + f"ZIP contains too many files (max {MAX_FILES_IN_ZIP})" + ) + + # Check individual file size + if zip_info.file_size > MAX_INDIVIDUAL_FILE_SIZE: + max_mb = MAX_INDIVIDUAL_FILE_SIZE / (1024 * 1024) + raise ValueError( + f"File '{zip_info.filename}' exceeds {max_mb:.0f}MB limit" + ) + + # Accumulate uncompressed size + total_uncompressed += zip_info.file_size + + # Check total uncompressed size (Zip bomb protection) + if total_uncompressed > MAX_UNCOMPRESSED_SIZE: + max_mb = MAX_UNCOMPRESSED_SIZE / (1024 * 1024) + raise ValueError( + f"Total uncompressed size exceeds {max_mb:.0f}MB limit" + ) + + # Validate filename safety + try: + self._safe_extract_filename(zip_info.filename) + except ValueError as e: + logger.warning(f"Rejecting malicious ZIP entry: {e}") + raise ValueError(f"Invalid file in ZIP: {zip_info.filename}") + + def process_zip_upload( + self, + admin_token: str, + zip_filename: str, + zip_content: bytes, + upload_source: str = "ui", + ) -> dict[str, Any]: + """Process a ZIP file containing PDFs and optional CSV. 
+ + Args: + admin_token: Admin authentication token + zip_filename: Name of the ZIP file + zip_content: ZIP file content as bytes + upload_source: Upload source (ui or api) + + Returns: + Dictionary with batch upload results + """ + batch = self.admin_db.create_batch_upload( + admin_token=admin_token, + filename=zip_filename, + file_size=len(zip_content), + upload_source=upload_source, + ) + + try: + with zipfile.ZipFile(io.BytesIO(zip_content)) as zip_file: + # Validate ZIP safety first + self._validate_zip_safety(zip_file) + + result = self._process_zip_contents( + batch_id=batch.batch_id, + admin_token=admin_token, + zip_file=zip_file, + ) + + # Update batch upload status + self.admin_db.update_batch_upload( + batch_id=batch.batch_id, + status=result["status"], + total_files=result["total_files"], + processed_files=result["processed_files"], + successful_files=result["successful_files"], + failed_files=result["failed_files"], + csv_filename=result.get("csv_filename"), + csv_row_count=result.get("csv_row_count"), + completed_at=datetime.utcnow(), + ) + + return { + "batch_id": str(batch.batch_id), + **result, + } + + except zipfile.BadZipFile as e: + logger.error(f"Invalid ZIP file {zip_filename}: {e}") + self.admin_db.update_batch_upload( + batch_id=batch.batch_id, + status="failed", + error_message="Invalid ZIP file format", + completed_at=datetime.utcnow(), + ) + return { + "batch_id": str(batch.batch_id), + "status": "failed", + "error": "Invalid ZIP file format", + } + except ValueError as e: + # Security validation errors + logger.warning(f"ZIP validation failed for {zip_filename}: {e}") + self.admin_db.update_batch_upload( + batch_id=batch.batch_id, + status="failed", + error_message="ZIP file validation failed", + completed_at=datetime.utcnow(), + ) + return { + "batch_id": str(batch.batch_id), + "status": "failed", + "error": "ZIP file validation failed", + } + except Exception as e: + logger.error(f"Error processing ZIP file {zip_filename}: {e}", 
exc_info=True) + self.admin_db.update_batch_upload( + batch_id=batch.batch_id, + status="failed", + error_message="Processing error", + completed_at=datetime.utcnow(), + ) + return { + "batch_id": str(batch.batch_id), + "status": "failed", + "error": "Failed to process batch upload", + } + + def _process_zip_contents( + self, + batch_id: UUID, + admin_token: str, + zip_file: zipfile.ZipFile, + ) -> dict[str, Any]: + """Process contents of ZIP file. + + Args: + batch_id: Batch upload ID + admin_token: Admin authentication token + zip_file: Opened ZIP file + + Returns: + Processing results dictionary + """ + # Extract file lists + pdf_files = [] + csv_file = None + csv_data = {} + + for file_info in zip_file.filelist: + if file_info.is_dir(): + continue + + try: + # Use safe filename extraction + filename = self._safe_extract_filename(file_info.filename) + except ValueError as e: + logger.warning(f"Skipping invalid file: {e}") + continue + + if filename.lower().endswith('.pdf'): + pdf_files.append(file_info) + elif filename.lower().endswith('.csv'): + if csv_file is None: + csv_file = file_info + # Parse CSV + csv_data = self._parse_csv_file(zip_file, file_info) + else: + logger.warning(f"Multiple CSV files found, using first: {csv_file.filename}") + + if not pdf_files: + return { + "status": "failed", + "total_files": 0, + "processed_files": 0, + "successful_files": 0, + "failed_files": 0, + "error": "No PDF files found in ZIP", + } + + # Process each PDF file + total_files = len(pdf_files) + successful_files = 0 + failed_files = 0 + + for pdf_info in pdf_files: + file_record = None + + try: + # Use safe filename extraction + filename = self._safe_extract_filename(pdf_info.filename) + + # Create batch upload file record + file_record = self.admin_db.create_batch_upload_file( + batch_id=batch_id, + filename=filename, + status="processing", + ) + + # Get CSV data for this file if available + document_id_base = Path(filename).stem + csv_row_data = 
csv_data.get(document_id_base) + + # Extract PDF content + pdf_content = zip_file.read(pdf_info.filename) + + # TODO: Save PDF file and create document + # For now, just mark as completed + + self.admin_db.update_batch_upload_file( + file_id=file_record.file_id, + status="completed", + csv_row_data=csv_row_data, + processed_at=datetime.utcnow(), + ) + + successful_files += 1 + + except ValueError as e: + # Path validation error + logger.warning(f"Skipping invalid file: {e}") + if file_record: + self.admin_db.update_batch_upload_file( + file_id=file_record.file_id, + status="failed", + error_message="Invalid filename", + processed_at=datetime.utcnow(), + ) + failed_files += 1 + + except Exception as e: + logger.error(f"Error processing PDF: {e}", exc_info=True) + if file_record: + self.admin_db.update_batch_upload_file( + file_id=file_record.file_id, + status="failed", + error_message="Processing error", + processed_at=datetime.utcnow(), + ) + failed_files += 1 + + # Determine overall status + if failed_files == 0: + status = "completed" + elif successful_files == 0: + status = "failed" + else: + status = "partial" + + result = { + "status": status, + "total_files": total_files, + "processed_files": total_files, + "successful_files": successful_files, + "failed_files": failed_files, + } + + if csv_file: + result["csv_filename"] = Path(csv_file.filename).name + result["csv_row_count"] = len(csv_data) + + return result + + def _parse_csv_file( + self, + zip_file: zipfile.ZipFile, + csv_file_info: zipfile.ZipInfo, + ) -> dict[str, dict[str, Any]]: + """Parse CSV file and extract field values with validation. 
+ + Args: + zip_file: Opened ZIP file + csv_file_info: CSV file info + + Returns: + Dictionary mapping DocumentId to validated field values + """ + # Try multiple encodings + csv_bytes = zip_file.read(csv_file_info.filename) + encodings = ['utf-8-sig', 'utf-8', 'latin-1', 'cp1252'] + csv_content = None + + for encoding in encodings: + try: + csv_content = csv_bytes.decode(encoding) + logger.info(f"CSV decoded with {encoding}") + break + except UnicodeDecodeError: + continue + + if csv_content is None: + logger.error("Failed to decode CSV with any encoding") + raise ValueError("Unable to decode CSV file") + + csv_reader = csv.DictReader(io.StringIO(csv_content)) + result = {} + + # Case-insensitive column mapping + field_name_map = { + 'DocumentId': ['DocumentId', 'documentid', 'document_id'], + 'InvoiceNumber': ['InvoiceNumber', 'invoicenumber', 'invoice_number'], + 'InvoiceDate': ['InvoiceDate', 'invoicedate', 'invoice_date'], + 'InvoiceDueDate': ['InvoiceDueDate', 'invoiceduedate', 'invoice_due_date'], + 'Amount': ['Amount', 'amount'], + 'OCR': ['OCR', 'ocr'], + 'Bankgiro': ['Bankgiro', 'bankgiro'], + 'Plusgiro': ['Plusgiro', 'plusgiro'], + 'customer_number': ['customer_number', 'customernumber', 'CustomerNumber'], + 'supplier_organisation_number': ['supplier_organisation_number', 'supplierorganisationnumber'], + } + + for row_num, row in enumerate(csv_reader, start=2): + try: + # Create case-insensitive lookup + row_lower = {k.lower(): v for k, v in row.items()} + + # Find DocumentId with case-insensitive matching + document_id = None + for variant in field_name_map['DocumentId']: + if variant.lower() in row_lower: + document_id = row_lower[variant.lower()] + break + + if not document_id: + logger.warning(f"Row {row_num}: No DocumentId found") + continue + + # Validate using Pydantic model + csv_row_dict = {'document_id': document_id} + + # Map CSV field names to model attribute names + csv_to_model_attr = { + 'InvoiceNumber': 'invoice_number', + 'InvoiceDate': 
'invoice_date', + 'InvoiceDueDate': 'invoice_due_date', + 'Amount': 'amount', + 'OCR': 'ocr', + 'Bankgiro': 'bankgiro', + 'Plusgiro': 'plusgiro', + 'customer_number': 'customer_number', + 'supplier_organisation_number': 'supplier_organisation_number', + } + + for csv_field in field_name_map.keys(): + if csv_field == 'DocumentId': + continue + + model_attr = csv_to_model_attr.get(csv_field) + if not model_attr: + continue + + for variant in field_name_map[csv_field]: + if variant.lower() in row_lower and row_lower[variant.lower()]: + csv_row_dict[model_attr] = row_lower[variant.lower()] + break + + # Validate + validated_row = CSVRowData(**csv_row_dict) + + # Extract only the fields we care about (map back to CSV field names) + field_values = {} + model_attr_to_csv = { + 'invoice_number': 'InvoiceNumber', + 'invoice_date': 'InvoiceDate', + 'invoice_due_date': 'InvoiceDueDate', + 'amount': 'Amount', + 'ocr': 'OCR', + 'bankgiro': 'Bankgiro', + 'plusgiro': 'Plusgiro', + 'customer_number': 'customer_number', + 'supplier_organisation_number': 'supplier_organisation_number', + } + + for model_attr, csv_field in model_attr_to_csv.items(): + value = getattr(validated_row, model_attr, None) + if value and csv_field in CSV_TO_CLASS_MAPPING: + field_values[csv_field] = value + + if field_values: + result[document_id] = field_values + + except Exception as e: + logger.warning(f"Row {row_num}: Validation error - {e}") + continue + + return result + + def get_batch_status(self, batch_id: str) -> dict[str, Any]: + """Get batch upload status. 
def get_document_db() -> DocumentDB:
    """Return the shared DocumentDB, connecting and creating tables on first use.

    Follows the same pattern as CLI autolabel (src/cli/autolabel.py lines 370-373).

    The instance is stored in the module-level ``_document_db`` cache only
    after ``connect()`` and ``create_tables()`` have both returned. If either
    call raises, the cache stays ``None`` so the next call retries instead of
    handing out an instance that never finished initializing.

    Returns:
        The cached, initialized DocumentDB instance.
    """
    global _document_db
    if _document_db is None:
        # Build and initialize on a local name first; publish only on success.
        candidate = DocumentDB()
        candidate.connect()
        candidate.create_tables()  # Ensure tables exist
        _document_db = candidate
        logger.info("Connected to PostgreSQL DocumentDB for match reports")
    return _document_db
def _autolabel_failure_message(result: dict[str, Any]) -> str:
    """Build the ``auto_label_error`` text for an unsuccessful task result."""
    # ``report`` may be absent or explicitly None; treat both as "no report".
    report = result.get("report") or {}
    if "errors" in report:
        errors = report["errors"]
    elif result.get("error"):
        # No per-report errors: fall back to the task-level error, if any.
        errors = [result["error"]]
    else:
        errors = ["Unknown error"]

    if not errors:
        return "No annotations generated"
    return "; ".join(str(err) for err in errors)


def process_document_autolabel(
    document: AdminDocument,
    db: AdminDB,
    output_dir: Path | None = None,
    dpi: int = DEFAULT_DPI,
    min_confidence: float = 0.5,
) -> dict[str, Any]:
    """
    Process a single document for auto-labeling using csv_field_values.

    The document is first marked ``auto_label_status="running"``. On success
    the generated annotations are saved and the document becomes ``labeled``;
    on any failure it is set back to ``pending`` with
    ``auto_label_status="failed"`` and an error message.

    Args:
        document: AdminDocument with csv_field_values and file_path
        db: AdminDB instance for updating status
        output_dir: Output directory for temp files
            (defaults to ``data/autolabel_output``)
        dpi: Rendering DPI
        min_confidence: Minimum match confidence

    Returns:
        The task result dictionary, or a dict with ``doc_id``,
        ``success=False`` and ``error`` when an exception was raised.
    """
    from src.processing.autolabel_tasks import process_text_pdf, process_scanned_pdf
    from src.pdf import PDFDocument

    document_id = str(document.document_id)
    file_path = Path(document.file_path)

    if output_dir is None:
        output_dir = Path("data/autolabel_output")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Mark as processing
    db.update_document_status(
        document_id=document_id,
        status="auto_labeling",
        auto_label_status="running",
    )

    try:
        # Check if file exists
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Convert csv_field_values to row_dict
        row_dict = convert_csv_field_values_to_row_dict(document)

        if len(row_dict) <= 1:  # Only has DocumentId
            raise ValueError("No field values to match")

        # Determine PDF type (text or scanned): fewer than 10 text tokens on
        # the first page is treated as "no extractable text".
        with PDFDocument(file_path) as pdf_doc:
            tokens = list(pdf_doc.extract_text_tokens(0))
        is_scanned = len(tokens) < 10

        # Build task data
        # Use admin_upload_dir (which is PATHS['pdf_dir']) for pdf_path
        # This ensures consistency with CLI autolabel for reprocess_failed.py
        storage_config = StorageConfig()
        pdf_path_for_report = storage_config.admin_upload_dir / f"{document_id}.pdf"

        task_data = {
            "row_dict": row_dict,
            "pdf_path": str(pdf_path_for_report),
            "output_dir": str(output_dir),
            "dpi": dpi,
            "min_confidence": min_confidence,
        }

        # Process based on PDF type
        if is_scanned:
            result = process_scanned_pdf(task_data)
        else:
            result = process_text_pdf(task_data)

        report = result.get("report")

        # Save report to DocumentDB (same as CLI autolabel). Best effort: a
        # failure here must not fail the labeling itself.
        if report:
            try:
                doc_db = get_document_db()
                doc_db.save_document(report)
                logger.info(f"Saved match report to DocumentDB for {document_id}")
            except Exception as e:
                logger.warning(f"Failed to save report to DocumentDB: {e}")

        # Save annotations to AdminDB
        if result.get("success") and report:
            _save_annotations_to_db(
                db=db,
                document_id=document_id,
                report=report,
                page_annotations=result.get("pages", []),
                dpi=dpi,
            )

            # Mark as completed
            db.update_document_status(
                document_id=document_id,
                status="labeled",
                auto_label_status="completed",
            )
        else:
            # Mark as failed
            db.update_document_status(
                document_id=document_id,
                status="pending",
                auto_label_status="failed",
                auto_label_error=_autolabel_failure_message(result),
            )

        return result

    except Exception as e:
        logger.error(f"Error processing document {document_id}: {e}", exc_info=True)

        # Mark as failed
        db.update_document_status(
            document_id=document_id,
            status="pending",
            auto_label_status="failed",
            auto_label_error=str(e),
        )

        return {
            "doc_id": document_id,
            "success": False,
            "error": str(e),
        }
annotation_count = 0 + + # Get field results from report (list of dicts) + field_results = report.get("field_results", []) + + for field_info in field_results: + if not field_info.get("matched"): + continue + + csv_field_name = field_info.get("field_name", "") + + # Map CSV field name to internal field name + field_name = CSV_TO_INTERNAL_FIELD.get(csv_field_name, csv_field_name) + + # Get class_id from field name + class_id = FIELD_CLASS_IDS.get(field_name) + if class_id is None: + logger.warning(f"Unknown field name: {csv_field_name} -> {field_name}") + continue + + # Get bbox info (list: [x, y, x2, y2] in PDF points - 72 DPI) + bbox = field_info.get("bbox", []) + if not bbox or len(bbox) < 4: + continue + + # Convert PDF points (72 DPI) to pixel coordinates (at configured DPI) + pdf_x1, pdf_y1, pdf_x2, pdf_y2 = bbox[0], bbox[1], bbox[2], bbox[3] + x1 = pdf_x1 * scale + y1 = pdf_y1 * scale + x2 = pdf_x2 * scale + y2 = pdf_y2 * scale + + bbox_width = x2 - x1 + bbox_height = y2 - y1 + + # Get page number (convert to 1-indexed) + page_no = field_info.get("page_no", 0) + 1 + + # Get image dimensions for normalization + dims = get_image_dimensions(page_no) + if dims: + img_width, img_height = dims + # Calculate normalized coordinates + x_center = (x1 + x2) / 2 / img_width + y_center = (y1 + y2) / 2 / img_height + width = bbox_width / img_width + height = bbox_height / img_height + else: + # Fallback: use pixel coordinates as-is for normalization + # (will be slightly off but better than /1000) + logger.warning(f"Could not get image dimensions for page {page_no}, using estimates") + # Estimate A4 at configured DPI: 595 x 842 points * scale + estimated_width = 595 * scale + estimated_height = 842 * scale + x_center = (x1 + x2) / 2 / estimated_width + y_center = (y1 + y2) / 2 / estimated_height + width = bbox_width / estimated_width + height = bbox_height / estimated_height + + # Create annotation + try: + db.create_annotation( + document_id=document_id, + 
def save_manual_annotations_to_document_db(
    document: AdminDocument,
    annotations: list,
    db: AdminDB,
) -> dict[str, Any]:
    """
    Save manual annotations to PostgreSQL documents and field_results tables.

    Called when user marks a document as 'labeled' from the web UI.
    This ensures manually labeled documents are also tracked in the same
    database as auto-labeled documents for consistency.

    Args:
        document: AdminDocument instance
        annotations: List of AdminAnnotation instances
        db: AdminDB instance (not used by this function; kept for interface
            compatibility)

    Returns:
        Dict with ``success`` and ``document_id``, plus ``fields_saved`` and
        ``message`` on success or ``error`` on failure.
    """
    from datetime import datetime, timezone

    document_id = str(document.document_id)
    storage_config = StorageConfig()

    # Build pdf_path using admin_upload_dir (same as auto-label)
    pdf_path = storage_config.admin_upload_dir / f"{document_id}.pdf"

    # Build report dict compatible with DocumentDB.save_document().
    # All manual annotations are considered "matched" since user verified them.
    field_results = [
        {
            "field_name": ann.class_name,
            "csv_value": ann.text_value or "",  # Manual annotations may not have CSV value
            "matched": True,
            # Only a missing confidence defaults to 1.0; an explicit 0.0 is kept.
            "score": ann.confidence if ann.confidence is not None else 1.0,
            "matched_text": ann.text_value,
            "candidate_used": "manual",
            "bbox": [
                ann.bbox_x,
                ann.bbox_y,
                ann.bbox_x + ann.bbox_width,
                ann.bbox_y + ann.bbox_height,
            ],
            "page_no": ann.page_number - 1,  # Convert to 0-indexed
            "context_keywords": [],
            "error": None,
        }
        for ann in annotations
    ]
    fields_total = len(annotations)
    fields_matched = len(field_results)

    # Determine PDF type (same "< 10 tokens on page 0" rule as auto-label)
    pdf_type = "unknown"
    if pdf_path.exists():
        try:
            from src.pdf import PDFDocument
            with PDFDocument(pdf_path) as pdf_doc:
                tokens = list(pdf_doc.extract_text_tokens(0))
                pdf_type = "scanned" if len(tokens) < 10 else "text"
        except Exception as e:
            logger.warning(f"Could not determine PDF type: {e}")

    # Missing/empty csv_field_values yields None for every extended field.
    csv_values = document.csv_field_values or {}

    # Build report
    report = {
        "document_id": document_id,
        "pdf_path": str(pdf_path),
        "pdf_type": pdf_type,
        "success": fields_matched > 0,
        "total_pages": document.page_count,
        "fields_matched": fields_matched,
        "fields_total": fields_total,
        "annotations_generated": fields_matched,
        "processing_time_ms": 0,  # Manual labeling - no processing time
        # Naive UTC ISO string, same format datetime.utcnow().isoformat() gave.
        "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
        "errors": [],
        "field_results": field_results,
        # Extended fields (from CSV if available)
        "split": None,
        "customer_number": csv_values.get("customer_number"),
        "supplier_name": csv_values.get("supplier_name"),
        "supplier_organisation_number": csv_values.get("supplier_organisation_number"),
        "supplier_accounts": csv_values.get("supplier_accounts"),
    }

    # Save to PostgreSQL DocumentDB
    try:
        doc_db = get_document_db()
        doc_db.save_document(report)
        logger.info(f"Saved manual annotations to DocumentDB for {document_id}: {fields_matched} fields")

        return {
            "success": True,
            "document_id": document_id,
            "fields_saved": fields_matched,
            "message": f"Saved {fields_matched} annotations to DocumentDB",
        }

    except Exception as e:
        logger.error(f"Failed to save manual annotations to DocumentDB: {e}", exc_info=True)
        return {
            "success": False,
            "document_id": document_id,
            "error": str(e),
        }
page).""" - from ..pdf.renderer import render_pdf_to_images + from src.pdf.renderer import render_pdf_to_images from ultralytics import YOLO import io diff --git a/src/web/workers/__init__.py b/src/web/workers/__init__.py new file mode 100644 index 0000000..8b8834d --- /dev/null +++ b/src/web/workers/__init__.py @@ -0,0 +1,24 @@ +""" +Background Task Queues + +Worker queues for asynchronous and batch processing. +""" + +from src.web.workers.async_queue import AsyncTaskQueue, AsyncTask +from src.web.workers.batch_queue import ( + BatchTaskQueue, + BatchTask, + init_batch_queue, + shutdown_batch_queue, + get_batch_queue, +) + +__all__ = [ + "AsyncTaskQueue", + "AsyncTask", + "BatchTaskQueue", + "BatchTask", + "init_batch_queue", + "shutdown_batch_queue", + "get_batch_queue", +] diff --git a/src/web/workers/async_queue.py b/src/web/workers/async_queue.py new file mode 100644 index 0000000..4b71180 --- /dev/null +++ b/src/web/workers/async_queue.py @@ -0,0 +1,181 @@ +""" +Async Task Queue + +Thread-safe queue for background invoice processing. 
+""" + +import logging +import time +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from queue import Empty, Full, Queue +import threading +from threading import Event, Lock, Thread +from typing import Callable + +logger = logging.getLogger(__name__) + + +@dataclass +class AsyncTask: + """Task queued for background processing.""" + + request_id: str + api_key: str + file_path: Path + filename: str + created_at: datetime = field(default_factory=datetime.utcnow) + priority: int = 0 # Lower = higher priority (not implemented yet) + + +class AsyncTaskQueue: + """Thread-safe queue for async invoice processing.""" + + def __init__( + self, + max_size: int = 100, + worker_count: int = 1, + ) -> None: + self._queue: Queue[AsyncTask] = Queue(maxsize=max_size) + self._workers: list[Thread] = [] + self._stop_event = Event() + self._worker_count = worker_count + self._lock = Lock() + self._processing: set[str] = set() # Currently processing request_ids + self._task_handler: Callable[[AsyncTask], None] | None = None + self._started = False + + def start(self, task_handler: Callable[[AsyncTask], None]) -> None: + """Start background worker threads.""" + if self._started: + logger.warning("AsyncTaskQueue already started") + return + + self._task_handler = task_handler + self._stop_event.clear() + + for i in range(self._worker_count): + worker = Thread( + target=self._worker_loop, + name=f"async-worker-{i}", + daemon=True, + ) + worker.start() + self._workers.append(worker) + logger.info(f"Started async worker thread: {worker.name}") + + self._started = True + logger.info(f"AsyncTaskQueue started with {self._worker_count} workers") + + def stop(self, timeout: float = 30.0) -> None: + """Gracefully stop all workers.""" + if not self._started: + return + + logger.info("Stopping AsyncTaskQueue...") + self._stop_event.set() + + # Wait for workers to finish + for worker in self._workers: + worker.join(timeout=timeout / 
self._worker_count) + if worker.is_alive(): + logger.warning(f"Worker {worker.name} did not stop gracefully") + + self._workers.clear() + self._started = False + logger.info("AsyncTaskQueue stopped") + + def submit(self, task: AsyncTask) -> bool: + """ + Submit a task to the queue. + + Returns: + True if task was queued, False if queue is full + """ + try: + self._queue.put_nowait(task) + logger.info(f"Task {task.request_id} queued for processing") + return True + except Full: + logger.warning(f"Queue full, task {task.request_id} rejected") + return False + + def get_queue_depth(self) -> int: + """Get current number of tasks in queue.""" + return self._queue.qsize() + + def get_processing_count(self) -> int: + """Get number of tasks currently being processed.""" + with self._lock: + return len(self._processing) + + def is_processing(self, request_id: str) -> bool: + """Check if a specific request is currently being processed.""" + with self._lock: + return request_id in self._processing + + @property + def is_running(self) -> bool: + """Check if the queue is running.""" + return self._started and not self._stop_event.is_set() + + def _worker_loop(self) -> None: + """Worker loop that processes tasks from queue.""" + thread_name = threading.current_thread().name + logger.info(f"Worker {thread_name} started") + + while not self._stop_event.is_set(): + try: + # Block for up to 1 second waiting for tasks + task = self._queue.get(timeout=1.0) + except Empty: + continue + + try: + with self._lock: + self._processing.add(task.request_id) + + logger.info( + f"Worker {thread_name} processing task {task.request_id}" + ) + start_time = time.time() + + if self._task_handler: + self._task_handler(task) + + elapsed = time.time() - start_time + logger.info( + f"Worker {thread_name} completed task {task.request_id} " + f"in {elapsed:.2f}s" + ) + + except Exception as e: + logger.error( + f"Worker {thread_name} failed to process task " + f"{task.request_id}: {e}", + exc_info=True, + 
) + + finally: + with self._lock: + self._processing.discard(task.request_id) + self._queue.task_done() + + logger.info(f"Worker {thread_name} stopped") + + def wait_for_completion(self, timeout: float | None = None) -> bool: + """ + Wait for all queued tasks to complete. + + Args: + timeout: Maximum time to wait in seconds + + Returns: + True if all tasks completed, False if timeout + """ + try: + self._queue.join() + return True + except Exception: + return False diff --git a/src/web/workers/batch_queue.py b/src/web/workers/batch_queue.py new file mode 100644 index 0000000..9e3ff3f --- /dev/null +++ b/src/web/workers/batch_queue.py @@ -0,0 +1,225 @@ +""" +Batch Upload Processing Queue + +Background queue for async batch upload processing. +""" + +import logging +import threading +from dataclasses import dataclass +from datetime import datetime +from queue import Queue, Full, Empty +from typing import Any +from uuid import UUID + +logger = logging.getLogger(__name__) + + +@dataclass +class BatchTask: + """Task for batch upload processing.""" + + batch_id: UUID + admin_token: str + zip_content: bytes + zip_filename: str + upload_source: str + auto_label: bool + created_at: datetime + + +class BatchTaskQueue: + """Thread-safe queue for async batch upload processing.""" + + def __init__(self, max_size: int = 20, worker_count: int = 2): + """Initialize the batch task queue. + + Args: + max_size: Maximum queue size + worker_count: Number of worker threads + """ + self._queue: Queue[BatchTask] = Queue(maxsize=max_size) + self._workers: list[threading.Thread] = [] + self._stop_event = threading.Event() + self._worker_count = worker_count + self._batch_service: Any | None = None + self._running = False + self._lock = threading.Lock() + + def start(self, batch_service: Any) -> None: + """Start worker threads with batch service. 
def stop(self, timeout: float = 30.0) -> None:
    """Stop all worker threads gracefully.

    The lock is held for the whole shutdown so a concurrent ``start()``
    cannot clear the stop event while old workers are still exiting.

    Args:
        timeout: Maximum total time to wait for workers to finish; the
            budget is split evenly across the worker threads
    """
    with self._lock:
        if not self._running:
            return

        logger.info("Stopping batch queue...")
        self._stop_event.set()
        self._running = False

        # Wait for workers to finish
        if self._workers:
            per_worker_timeout = timeout / len(self._workers)
            for worker in self._workers:
                worker.join(timeout=per_worker_timeout)
                if worker.is_alive():
                    logger.warning(f"{worker.name} did not stop gracefully")

        self._workers.clear()
        logger.info("Batch queue stopped")
def _worker_loop(self) -> None:
    """Worker thread main loop.

    Pulls tasks until the stop event is set. Every dequeued task is matched
    by exactly one ``task_done()`` call, even if processing raises.
    """
    worker_name = threading.current_thread().name
    logger.info(f"{worker_name} started")

    while not self._stop_event.is_set():
        try:
            # Get task with timeout to check stop event periodically
            task = self._queue.get(timeout=1.0)
        except Empty:
            # No tasks, continue loop to check stop event
            continue

        try:
            self._process_task(task)
        except Exception as e:
            logger.error(f"{worker_name} error processing task: {e}", exc_info=True)
        finally:
            # Balance the get() so Queue.join() cannot hang on a failed task.
            self._queue.task_done()

    logger.info(f"{worker_name} stopped")
def shutdown_batch_queue() -> None:
    """Stop the global batch queue if one has been created; otherwise do nothing."""
    global _batch_queue

    queue = _batch_queue
    if queue is None:
        # Nothing was ever created, so there is nothing to stop.
        return
    queue.stop()
class TestBatchUpload:
    """Tests for BatchUpload model."""

    def test_batch_upload_creation(self):
        """A freshly built upload carries its inputs and the model defaults."""
        upload = BatchUpload(
            admin_token="test-token",
            filename="invoices.zip",
            file_size=1024000,
            upload_source="ui",
        )

        # Generated identifier
        assert upload.batch_id is not None
        assert isinstance(upload.batch_id, UUID)

        # Values passed in
        assert upload.admin_token == "test-token"
        assert upload.filename == "invoices.zip"
        assert upload.file_size == 1024000
        assert upload.upload_source == "ui"

        # Defaults
        assert upload.status == "processing"
        assert upload.total_files == 0
        assert upload.processed_files == 0
        assert upload.successful_files == 0
        assert upload.failed_files == 0
        assert upload.error_message is None
        assert upload.completed_at is None

    def test_batch_upload_api_source(self):
        """Upload source 'api' is stored as given."""
        upload = BatchUpload(
            admin_token="api-token",
            filename="batch.zip",
            file_size=2048000,
            upload_source="api",
        )

        assert upload.upload_source == "api"

    def test_batch_upload_with_progress(self):
        """Progress counters are stored as given."""
        counters = {
            "total_files": 100,
            "processed_files": 50,
            "successful_files": 48,
            "failed_files": 2,
        }
        upload = BatchUpload(
            admin_token="test-token",
            filename="large_batch.zip",
            file_size=10240000,
            status="processing",
            **counters,
        )

        assert upload.total_files == 100
        assert upload.processed_files == 50
        assert upload.successful_files == 48
        assert upload.failed_files == 2

    def test_batch_upload_completed(self):
        """A completed upload keeps its status and completion timestamp."""
        finished_at = datetime.utcnow()
        upload = BatchUpload(
            admin_token="test-token",
            filename="batch.zip",
            file_size=1024000,
            status="completed",
            total_files=10,
            processed_files=10,
            successful_files=10,
            failed_files=0,
            completed_at=finished_at,
        )

        assert upload.status == "completed"
        assert upload.completed_at == finished_at

    def test_batch_upload_failed(self):
        """A failed upload keeps its status and error message."""
        upload = BatchUpload(
            admin_token="test-token",
            filename="bad.zip",
            file_size=1024,
            status="failed",
            error_message="Invalid ZIP file format",
        )

        assert upload.status == "failed"
        assert upload.error_message == "Invalid ZIP file format"

    def test_batch_upload_partial(self):
        """A partial upload keeps its status and failure count."""
        upload = BatchUpload(
            admin_token="test-token",
            filename="mixed.zip",
            file_size=5120000,
            status="partial",
            total_files=20,
            processed_files=20,
            successful_files=15,
            failed_files=5,
        )

        assert upload.status == "partial"
        assert upload.failed_files == 5
uuid4() + csv_data = { + "DocumentId": "INV003", + "InvoiceNumber": "F2024-003", + "Amount": "1500.00", + "OCR": "7350012345678", + } + file_record = BatchUploadFile( + batch_id=batch_id, + filename="INV003.pdf", + csv_row_data=csv_data, + ) + + assert file_record.csv_row_data == csv_data + assert file_record.csv_row_data["InvoiceNumber"] == "F2024-003" + + def test_batch_upload_file_failed(self): + """Test failed file record.""" + batch_id = uuid4() + file_record = BatchUploadFile( + batch_id=batch_id, + filename="corrupted.pdf", + status="failed", + error_message="Corrupted PDF file", + ) + + assert file_record.status == "failed" + assert file_record.error_message == "Corrupted PDF file" + + def test_batch_upload_file_skipped(self): + """Test skipped file record.""" + batch_id = uuid4() + file_record = BatchUploadFile( + batch_id=batch_id, + filename="not_a_pdf.txt", + status="skipped", + error_message="Not a PDF file", + ) + + assert file_record.status == "skipped" + + +class TestTrainingDocumentLink: + """Tests for TrainingDocumentLink model.""" + + def test_training_document_link_creation(self): + """Test basic link creation.""" + task_id = uuid4() + document_id = uuid4() + link = TrainingDocumentLink( + task_id=task_id, + document_id=document_id, + ) + + assert link.link_id is not None + assert isinstance(link.link_id, UUID) + assert link.task_id == task_id + assert link.document_id == document_id + assert link.annotation_snapshot is None + + def test_training_document_link_with_snapshot(self): + """Test link with annotation snapshot.""" + task_id = uuid4() + document_id = uuid4() + snapshot = { + "annotations": [ + { + "class_id": 0, + "class_name": "invoice_number", + "text_value": "F2024-001", + "x_center": 0.5, + "y_center": 0.3, + }, + { + "class_id": 6, + "class_name": "amount", + "text_value": "1500.00", + "x_center": 0.7, + "y_center": 0.6, + }, + ], + "total_count": 2, + "snapshot_time": "2024-01-20T15:00:00", + } + link = TrainingDocumentLink( + 
task_id=task_id, + document_id=document_id, + annotation_snapshot=snapshot, + ) + + assert link.annotation_snapshot == snapshot + assert len(link.annotation_snapshot["annotations"]) == 2 + + +class TestAnnotationHistory: + """Tests for AnnotationHistory model.""" + + def test_annotation_history_created(self): + """Test history record for creation.""" + annotation_id = uuid4() + new_value = { + "class_id": 0, + "class_name": "invoice_number", + "text_value": "F2024-001", + "bbox_x": 100, + "bbox_y": 200, + "bbox_width": 150, + "bbox_height": 30, + "source": "manual", + } + history = AnnotationHistory( + annotation_id=annotation_id, + action="created", + new_value=new_value, + changed_by="admin-token-123", + ) + + assert history.history_id is not None + assert history.annotation_id == annotation_id + assert history.action == "created" + assert history.previous_value is None + assert history.new_value == new_value + assert history.changed_by == "admin-token-123" + + def test_annotation_history_updated(self): + """Test history record for update.""" + annotation_id = uuid4() + previous_value = { + "text_value": "F2024-001", + "bbox_x": 100, + } + new_value = { + "text_value": "F2024-001-A", + "bbox_x": 110, + } + history = AnnotationHistory( + annotation_id=annotation_id, + action="updated", + previous_value=previous_value, + new_value=new_value, + changed_by="admin-token-123", + change_reason="Corrected OCR error", + ) + + assert history.action == "updated" + assert history.previous_value == previous_value + assert history.new_value == new_value + assert history.change_reason == "Corrected OCR error" + + def test_annotation_history_override(self): + """Test history record for override.""" + annotation_id = uuid4() + previous_value = { + "text_value": "F2024-001", + "source": "auto", + "confidence": 0.85, + } + new_value = { + "text_value": "F2024-001-CORRECTED", + "source": "manual", + "confidence": None, + } + history = AnnotationHistory( + 
annotation_id=annotation_id, + action="override", + previous_value=previous_value, + new_value=new_value, + changed_by="admin-token-123", + change_reason="Manual correction of auto-label", + ) + + assert history.action == "override" + + def test_annotation_history_deleted(self): + """Test history record for deletion.""" + annotation_id = uuid4() + previous_value = { + "class_id": 6, + "class_name": "amount", + "text_value": "1500.00", + } + history = AnnotationHistory( + annotation_id=annotation_id, + action="deleted", + previous_value=previous_value, + changed_by="admin-token-123", + change_reason="Incorrect annotation", + ) + + assert history.action == "deleted" + assert history.new_value is None + + +class TestAdminDocumentExtensions: + """Tests for AdminDocument extensions.""" + + def test_document_with_upload_source(self): + """Test document with upload source field.""" + doc = AdminDocument( + admin_token="test-token", + filename="invoice.pdf", + file_size=1024, + content_type="application/pdf", + file_path="/tmp/invoice.pdf", + upload_source="api", + ) + + assert doc.upload_source == "api" + + def test_document_with_batch_id(self): + """Test document linked to batch upload.""" + batch_id = uuid4() + doc = AdminDocument( + admin_token="test-token", + filename="invoice.pdf", + file_size=1024, + content_type="application/pdf", + file_path="/tmp/invoice.pdf", + batch_id=batch_id, + ) + + assert doc.batch_id == batch_id + + def test_document_with_csv_field_values(self): + """Test document with CSV field values.""" + csv_values = { + "InvoiceNumber": "F2024-001", + "Amount": "1500.00", + "OCR": "7350012345678", + } + doc = AdminDocument( + admin_token="test-token", + filename="invoice.pdf", + file_size=1024, + content_type="application/pdf", + file_path="/tmp/invoice.pdf", + csv_field_values=csv_values, + ) + + assert doc.csv_field_values == csv_values + + def test_document_with_annotation_lock(self): + """Test document with annotation lock.""" + lock_until = 
datetime.utcnow() + doc = AdminDocument( + admin_token="test-token", + filename="invoice.pdf", + file_size=1024, + content_type="application/pdf", + file_path="/tmp/invoice.pdf", + annotation_lock_until=lock_until, + ) + + assert doc.annotation_lock_until == lock_until + + +class TestAdminAnnotationExtensions: + """Tests for AdminAnnotation extensions.""" + + def test_annotation_with_verification(self): + """Test annotation with verification fields.""" + now = datetime.utcnow() + ann = AdminAnnotation( + document_id=uuid4(), + class_id=0, + class_name="invoice_number", + x_center=0.5, + y_center=0.3, + width=0.2, + height=0.05, + bbox_x=100, + bbox_y=200, + bbox_width=150, + bbox_height=30, + is_verified=True, + verified_at=now, + verified_by="admin-token-123", + ) + + assert ann.is_verified is True + assert ann.verified_at == now + assert ann.verified_by == "admin-token-123" + + def test_annotation_with_override_info(self): + """Test annotation with override information.""" + original_id = uuid4() + ann = AdminAnnotation( + document_id=uuid4(), + class_id=0, + class_name="invoice_number", + x_center=0.5, + y_center=0.3, + width=0.2, + height=0.05, + bbox_x=100, + bbox_y=200, + bbox_width=150, + bbox_height=30, + source="manual", + override_source="auto", + original_annotation_id=original_id, + ) + + assert ann.override_source == "auto" + assert ann.original_annotation_id == original_id + + +class TestTrainingTaskExtensions: + """Tests for TrainingTask extensions.""" + + def test_training_task_with_document_count(self): + """Test training task with document count.""" + task = TrainingTask( + admin_token="test-token", + name="Training Run 2024-01", + document_count=500, + ) + + assert task.document_count == 500 + + def test_training_task_with_metrics(self): + """Test training task with extracted metrics.""" + task = TrainingTask( + admin_token="test-token", + name="Training Run 2024-01", + status="completed", + metrics_mAP=0.935, + metrics_precision=0.92, + 
metrics_recall=0.88, + ) + + assert task.metrics_mAP == 0.935 + assert task.metrics_precision == 0.92 + assert task.metrics_recall == 0.88 + + +class TestCSVToClassMapping: + """Tests for CSV column to class ID mapping.""" + + def test_csv_mapping_exists(self): + """Test that CSV mapping is defined.""" + assert CSV_TO_CLASS_MAPPING is not None + assert len(CSV_TO_CLASS_MAPPING) > 0 + + def test_csv_mapping_values(self): + """Test specific CSV column mappings.""" + assert CSV_TO_CLASS_MAPPING["InvoiceNumber"] == 0 + assert CSV_TO_CLASS_MAPPING["InvoiceDate"] == 1 + assert CSV_TO_CLASS_MAPPING["InvoiceDueDate"] == 2 + assert CSV_TO_CLASS_MAPPING["OCR"] == 3 + assert CSV_TO_CLASS_MAPPING["Bankgiro"] == 4 + assert CSV_TO_CLASS_MAPPING["Plusgiro"] == 5 + assert CSV_TO_CLASS_MAPPING["Amount"] == 6 + assert CSV_TO_CLASS_MAPPING["supplier_organisation_number"] == 7 + assert CSV_TO_CLASS_MAPPING["customer_number"] == 9 + + def test_csv_mapping_matches_field_classes(self): + """Test that CSV mapping is consistent with FIELD_CLASSES.""" + for csv_name, class_id in CSV_TO_CLASS_MAPPING.items(): + assert class_id in FIELD_CLASSES diff --git a/tests/test_config.py b/tests/test_config.py index c76e09c..bc3c9b5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -18,7 +18,7 @@ class TestDatabaseConfig: def test_config_loads_from_env(self): """Test that config loads successfully from .env file.""" # Import config (should load .env automatically) - import config + from src import config # Verify database config is loaded assert config.DATABASE is not None @@ -30,7 +30,7 @@ class TestDatabaseConfig: def test_database_password_loaded(self): """Test that database password is loaded from environment.""" - import config + from src import config # Password should be loaded from .env assert config.DATABASE['password'] is not None @@ -38,7 +38,7 @@ class TestDatabaseConfig: def test_database_connection_string(self): """Test database connection string generation.""" - import 
config + from src import config conn_str = config.get_db_connection_string() @@ -71,7 +71,7 @@ class TestPathsConfig: def test_paths_config_exists(self): """Test that PATHS configuration exists.""" - import config + from src import config assert config.PATHS is not None assert 'csv_dir' in config.PATHS @@ -85,7 +85,7 @@ class TestAutolabelConfig: def test_autolabel_config_exists(self): """Test that AUTOLABEL configuration exists.""" - import config + from src import config assert config.AUTOLABEL is not None assert 'workers' in config.AUTOLABEL @@ -95,7 +95,7 @@ class TestAutolabelConfig: def test_autolabel_ratios_sum_to_one(self): """Test that train/val/test ratios sum to 1.0.""" - import config + from src import config total = ( config.AUTOLABEL['train_ratio'] + diff --git a/tests/web/__init__.py b/tests/web/__init__.py new file mode 100644 index 0000000..9feb713 --- /dev/null +++ b/tests/web/__init__.py @@ -0,0 +1 @@ +"""Tests for web API components.""" diff --git a/tests/web/conftest.py b/tests/web/conftest.py new file mode 100644 index 0000000..c9a0fa5 --- /dev/null +++ b/tests/web/conftest.py @@ -0,0 +1,132 @@ +""" +Test fixtures for web API tests. 
+"""
+
+import tempfile
+from datetime import datetime, timedelta
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+from uuid import UUID
+
+import pytest
+
+from src.data.async_request_db import ApiKeyConfig, AsyncRequestDB
+from src.data.models import AsyncRequest
+from src.web.workers.async_queue import AsyncTask, AsyncTaskQueue
+from src.web.services.async_processing import AsyncProcessingService
+from src.web.config import AsyncConfig, StorageConfig
+from src.web.core.rate_limiter import RateLimiter
+
+
+@pytest.fixture
+def mock_db():
+    """Create a MagicMock(spec=AsyncRequestDB) with permissive default return values."""
+    db = MagicMock(spec=AsyncRequestDB)
+
+    # Defaults: key is valid, no active jobs, first in queue
+    db.is_valid_api_key.return_value = True
+    db.get_api_key_config.return_value = ApiKeyConfig(
+        api_key="test-api-key",
+        name="Test Key",
+        is_active=True,
+        requests_per_minute=10,
+        max_concurrent_jobs=3,
+        max_file_size_mb=50,
+    )
+    db.count_active_jobs.return_value = 0
+    db.get_queue_position.return_value = 1
+
+    return db
+
+
+@pytest.fixture
+def rate_limiter(mock_db):
+    """Create a RateLimiter backed by the mock_db fixture."""
+    return RateLimiter(mock_db)
+
+
+@pytest.fixture
+def task_queue():
+    """Create an AsyncTaskQueue with max_size=10 and a single worker."""
+    return AsyncTaskQueue(max_size=10, worker_count=1)
+
+
+@pytest.fixture
+def async_config():
+    """Yield an AsyncConfig rooted in a temp directory that is removed on teardown."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        yield AsyncConfig(
+            queue_max_size=10,
+            worker_count=1,
+            task_timeout_seconds=30,
+            result_retention_days=7,
+            temp_upload_dir=Path(tmpdir) / "async",
+            max_file_size_mb=10,
+        )
+
+
+@pytest.fixture
+def storage_config():
+    """Yield a StorageConfig rooted in a temp directory that is removed on teardown."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        yield StorageConfig(
+            upload_dir=Path(tmpdir) / "uploads",
+            result_dir=Path(tmpdir) / "results",
+            max_file_size_mb=50,
+        )
+
+
+@pytest.fixture
+def mock_inference_service():
+    """Create a MagicMock service whose process_pdf/process_image return one canned result."""
+    service = MagicMock()
+    service.is_initialized = True
+    service.gpu_available = False
+
+    # Canned successful result shared by process_pdf and process_image
+    mock_result = MagicMock()
+    mock_result.document_id = "test-doc"
+    mock_result.success = True
+    mock_result.document_type = "invoice"
+    mock_result.fields = {"InvoiceNumber": "12345", "Amount": "1000.00"}
+    mock_result.confidence = {"InvoiceNumber": 0.95, "Amount": 0.92}
+    mock_result.detections = []
+    mock_result.errors = []
+    mock_result.visualization_path = None
+
+    service.process_pdf.return_value = mock_result
+    service.process_image.return_value = mock_result
+
+    return service
+
+
+# Valid UUID string shared by sample_async_request and sample_task
+TEST_REQUEST_UUID = "550e8400-e29b-41d4-a716-446655440000"
+
+
+@pytest.fixture
+def sample_async_request():
+    """Create a pending AsyncRequest for test.pdf that expires in 7 days."""
+    return AsyncRequest(
+        request_id=UUID(TEST_REQUEST_UUID),
+        api_key="test-api-key",
+        status="pending",
+        filename="test.pdf",
+        file_size=1024,
+        content_type="application/pdf",
+        expires_at=datetime.utcnow() + timedelta(days=7),
+    )
+
+
+@pytest.fixture
+def sample_task(tmp_path):
+    """Create a sample AsyncTask whose file lives under pytest's tmp_path."""
+    pdf_path = tmp_path / "test.pdf"  # pytest cleans tmp_path up; nothing leaks
+    pdf_path.write_bytes(b"fake pdf content")
+    return AsyncTask(
+        request_id=TEST_REQUEST_UUID,
+        api_key="test-api-key",
+        file_path=pdf_path,
+        filename="test.pdf",
+        created_at=datetime.utcnow(),
+    )
diff --git a/tests/web/test_admin_annotations.py b/tests/web/test_admin_annotations.py
new file mode 100644
index 0000000..2396fb2
--- /dev/null
+++ b/tests/web/test_admin_annotations.py
@@ -0,0 +1,197 @@
+"""
+Tests for Admin Annotation Routes.
+"""
+
+import pytest
+from datetime import datetime
+from unittest.mock import MagicMock, patch
+from uuid import UUID
+
+from fastapi import HTTPException
+
+from src.data.admin_models import AdminAnnotation, AdminDocument, FIELD_CLASSES
+from src.web.api.v1.admin.annotations import _validate_uuid, create_annotation_router
+from src.web.schemas.admin import (
+    AnnotationCreate,
+    AnnotationUpdate,
+    AutoLabelRequest,
+    BoundingBox,
+)
+
+
+# Test UUIDs
+TEST_DOC_UUID = "550e8400-e29b-41d4-a716-446655440000"
+TEST_ANN_UUID = "660e8400-e29b-41d4-a716-446655440001"
+TEST_TOKEN = "test-admin-token-12345"
+
+
+class TestAnnotationRouterCreation:
+    """Tests for annotation router creation."""
+
+    def test_creates_router_with_endpoints(self):
+        """Test that the router exposes annotation, auto-label and image paths."""
+        router = create_annotation_router()
+
+        # Get route paths (includes prefix)
+        paths = [route.path for route in router.routes]
+
+        # Substring checks only, so they hold regardless of the router prefix
+        assert any("{document_id}/annotations" in p for p in paths)
+        assert any("{annotation_id}" in p for p in paths)
+        assert any("auto-label" in p for p in paths)
+        assert any("images" in p for p in paths)
+
+
+class TestAnnotationCreateSchema:
+    """Tests for AnnotationCreate schema."""
+
+    def test_valid_annotation(self):
+        """Test that a fully specified AnnotationCreate keeps the values it was given."""
+        ann = AnnotationCreate(
+            page_number=1,
+            class_id=0,
+            bbox=BoundingBox(x=100, y=100, width=200, height=50),
+            text_value="12345",
+        )
+
+        assert ann.page_number == 1
+        assert ann.class_id == 0
+        assert ann.bbox.x == 100
+        assert ann.text_value == "12345"
+
+    def test_class_id_range(self):
+        """Test that every class_id in 0-9 is accepted."""
+        # NOTE(review): only valid IDs are exercised; rejection of out-of-range IDs is untested
+        for class_id in range(10):
+            ann = AnnotationCreate(
+                page_number=1,
+                class_id=class_id,
+                bbox=BoundingBox(x=0, y=0, width=100, height=50),
+            )
+            assert ann.class_id == class_id
+
+    def test_bbox_validation(self):
+        """Test that a BoundingBox with positive width/height can be constructed."""
+        bbox = BoundingBox(x=0, y=0, width=100, height=50)
+        assert bbox.width >= 1  # NOTE(review): echoes the input; invalid sizes are not exercised
+        assert bbox.height >= 1
+
+
+class TestAnnotationUpdateSchema:
+    """Tests for AnnotationUpdate schema."""
+
+    def test_partial_update(self):
+        """Test that fields omitted from an update default to None."""
+        update = AnnotationUpdate(
+            text_value="new value",
+        )
+
+        assert update.text_value == "new value"
+        assert update.class_id is None
+        assert update.bbox is None
+
+    def test_bbox_update(self):
+        """Test that an update carrying only a bounding box keeps its values."""
+        update = AnnotationUpdate(
+            bbox=BoundingBox(x=50, y=50, width=150, height=75),
+        )
+
+        assert update.bbox.x == 50
+        assert update.bbox.width == 150
+
+
+class TestAutoLabelRequestSchema:
+    """Tests for AutoLabelRequest schema."""
+
+    def test_valid_request(self):
+        """Test that a request with two field values and replace_existing is accepted."""
+        request = AutoLabelRequest(
+            field_values={
+                "InvoiceNumber": "12345",
+                "Amount": "1000.00",
+            },
+            replace_existing=True,
+        )
+
+        assert len(request.field_values) == 2
+        assert request.field_values["InvoiceNumber"] == "12345"
+        assert request.replace_existing is True
+
+    def test_requires_field_values(self):
+        """Test that omitting field_values raises."""
+        with pytest.raises(Exception):  # NOTE(review): broad; narrow to the schema's validation error
+            AutoLabelRequest(replace_existing=True)
+
+
+class TestFieldClasses:
+    """Tests for field class mapping."""
+
+    def test_all_classes_defined(self):
+        """Test all 10 field classes are defined."""
+        assert len(FIELD_CLASSES) == 10
+
+    def test_class_ids_sequential(self):
+        """Test that the FIELD_CLASSES keys are exactly 0-9."""
+        assert set(FIELD_CLASSES.keys()) == set(range(10))
+
+    def test_known_field_names(self):
+        """Test that five well-known field names appear among the values."""
+        names = list(FIELD_CLASSES.values())
+
+        assert "invoice_number" in names
+        assert "invoice_date" in names
+        assert "amount" in names
+        assert "bankgiro" in names
+        assert "ocr_number" in names
+
+
+class TestAnnotationModel:
+    """Tests for AdminAnnotation model."""
+
+    def test_annotation_creation(self):
+        """Test that a fully specified AdminAnnotation keeps the values it was given."""
+        ann = AdminAnnotation(
+            document_id=UUID(TEST_DOC_UUID),
+            page_number=1,
+            class_id=0,
+            class_name="invoice_number",
+            x_center=0.5,
+            y_center=0.5,
+            width=0.2,
+            height=0.05,
+            bbox_x=100,
+            bbox_y=100,
+            bbox_width=200,
+            bbox_height=50,
+            text_value="12345",
+            confidence=0.95,
+            source="manual",
+        )
+
+        assert str(ann.document_id) == TEST_DOC_UUID
+        assert ann.class_id == 0
+        assert ann.x_center == 0.5
+        assert ann.source == "manual"
+
+    def test_normalized_coordinates(self):
+        """Test that a model built with in-range normalized coords reports them within 0-1."""
+        # NOTE(review): inputs are already in range; out-of-range rejection is not exercised
+        ann = AdminAnnotation(
+            document_id=UUID(TEST_DOC_UUID),
+            page_number=1,
+            class_id=0,
+            class_name="test",
+            x_center=0.5,
+            y_center=0.5,
+            width=0.2,
+            height=0.05,
+            bbox_x=0,
+            bbox_y=0,
+            bbox_width=100,
+            bbox_height=50,
+        )
+
+        assert 0 <= ann.x_center <= 1
+        assert 0 <= ann.y_center <= 1
+        assert 0 <= ann.width <= 1
+        assert 0 <= ann.height <= 1
diff --git a/tests/web/test_admin_auth.py b/tests/web/test_admin_auth.py
new file mode 100644
index 0000000..2e12b02
--- /dev/null
+++ b/tests/web/test_admin_auth.py
@@ -0,0 +1,162 @@
+"""
+Tests for Admin Authentication.
+"""
+
+import asyncio
+import pytest
+from datetime import datetime, timedelta
+from unittest.mock import MagicMock, patch
+
+from fastapi import HTTPException
+
+from src.data.admin_db import AdminDB
+from src.data.admin_models import AdminToken
+from src.web.core.auth import (
+    get_admin_db,
+    reset_admin_db,
+    validate_admin_token,
+)
+
+
+def _run(coro):
+    """Run *coro* to completion on a private event loop and return its result."""
+    # asyncio.get_event_loop() is deprecated when no loop is current; a fresh loop
+    # also leaves the thread's current-loop state untouched for other tests.
+    loop = asyncio.new_event_loop()
+    try:
+        return loop.run_until_complete(coro)
+    finally:
+        loop.close()
+
+
+@pytest.fixture
+def mock_admin_db():
+    """Create a MagicMock(spec=AdminDB) that accepts every token by default."""
+    db = MagicMock(spec=AdminDB)
+    db.is_valid_admin_token.return_value = True
+    return db
+
+
+@pytest.fixture(autouse=True)
+def reset_db():
+    """Call reset_admin_db() after every test so the shared instance does not leak."""
+    yield
+    reset_admin_db()
+
+
+class TestValidateAdminToken:
+    """Tests for validate_admin_token dependency."""
+
+    def test_missing_token_raises_401(self, mock_admin_db):
+        """Test that missing token raises 401."""
+        with pytest.raises(HTTPException) as exc_info:
+            _run(validate_admin_token(None, mock_admin_db))
+
+        assert exc_info.value.status_code == 401
+        assert "Admin token required" in exc_info.value.detail
+
+    def test_invalid_token_raises_401(self, mock_admin_db):
+        """Test that invalid token raises 401."""
+        mock_admin_db.is_valid_admin_token.return_value = False
+
+        with pytest.raises(HTTPException) as exc_info:
+            _run(validate_admin_token("invalid-token", mock_admin_db))
+
+        assert exc_info.value.status_code == 401
+        assert "Invalid or expired" in exc_info.value.detail
+
+    def test_valid_token_returns_token(self, mock_admin_db):
+        """Test that a valid token is returned and its usage is recorded once."""
+        token = "valid-test-token"
+        mock_admin_db.is_valid_admin_token.return_value = True
+
+        result = _run(validate_admin_token(token, mock_admin_db))
+
+        assert result == token
+        mock_admin_db.update_admin_token_usage.assert_called_once_with(token)
+
+
+class TestAdminDB:
+    """Tests for AdminDB operations."""
+
+    def test_is_valid_admin_token_active(self):
+        """Test that an active token without expiry is valid."""
+        with patch("src.data.admin_db.get_session_context") as mock_ctx:
+            mock_session = MagicMock()
+            mock_ctx.return_value.__enter__.return_value = mock_session
+
+            mock_token = AdminToken(
+                token="test-token",
+                name="Test",
+                is_active=True,
+                expires_at=None,
+            )
+            mock_session.get.return_value = mock_token
+
+            db = AdminDB()
+            assert db.is_valid_admin_token("test-token") is True
+
+    def test_is_valid_admin_token_inactive(self):
+        """Test that a token with is_active=False is rejected."""
+        with patch("src.data.admin_db.get_session_context") as mock_ctx:
+            mock_session = MagicMock()
+            mock_ctx.return_value.__enter__.return_value = mock_session
+
+            mock_token = AdminToken(
+                token="test-token",
+                name="Test",
+                is_active=False,
+                expires_at=None,
+            )
+            mock_session.get.return_value = mock_token
+
+            db = AdminDB()
+            assert db.is_valid_admin_token("test-token") is False
+
+    def test_is_valid_admin_token_expired(self):
+        """Test that an active token whose expiry was a day ago is rejected."""
+        with patch("src.data.admin_db.get_session_context") as mock_ctx:
+            mock_session = MagicMock()
+            mock_ctx.return_value.__enter__.return_value = mock_session
+
+            mock_token = AdminToken(
+                token="test-token",
+                name="Test",
+                is_active=True,
+                expires_at=datetime.utcnow() - timedelta(days=1),
+            )
+            mock_session.get.return_value = mock_token
+
+            db = AdminDB()
+            assert db.is_valid_admin_token("test-token") is False
+
+    def test_is_valid_admin_token_not_found(self):
+        """Test that a token the session cannot find is rejected."""
+        with patch("src.data.admin_db.get_session_context") as mock_ctx:
+            mock_session = MagicMock()
+            mock_ctx.return_value.__enter__.return_value = mock_session
+            mock_session.get.return_value = None
+
+            db = AdminDB()
+            assert db.is_valid_admin_token("nonexistent") is False
+
+
+class TestGetAdminDb:
+    """Tests for get_admin_db function."""
+
+    def test_returns_singleton(self):
+        """Test that two consecutive get_admin_db() calls return the same object."""
+        reset_admin_db()
+
+        db1 = get_admin_db()
+        db2 = get_admin_db()
+
+        assert db1 is db2
+
+    def test_reset_clears_singleton(self):
+        """Test that reset_admin_db() makes the next get_admin_db() return a new object."""
+        db1 = get_admin_db()
+        reset_admin_db()
+        db2 = get_admin_db()
+
+        assert db1 is not db2
diff --git a/tests/web/test_admin_routes.py b/tests/web/test_admin_routes.py
new file mode 100644
index 0000000..070ea6e
--- /dev/null
+++ b/tests/web/test_admin_routes.py
@@ -0,0 +1,164 @@
+"""
+Tests for Admin Document Routes.
+"""
+
+import pytest
+from datetime import datetime
+from io import BytesIO
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+from uuid import UUID
+
+from fastapi import HTTPException
+from fastapi.testclient import TestClient
+
+from src.data.admin_models import AdminDocument, AdminToken
+from src.web.api.v1.admin.documents import _validate_uuid, create_admin_router
+
+
+# Test UUID
+TEST_DOC_UUID = "550e8400-e29b-41d4-a716-446655440000"
+TEST_TOKEN = "test-admin-token-12345"
+
+
+class TestValidateUUID:
+    """Tests for UUID validation."""
+
+    def test_valid_uuid(self):
+        """Test valid UUID passes validation."""
+        _validate_uuid(TEST_DOC_UUID, "test")  # Should not raise
+
+    def test_invalid_uuid_raises_400(self):
+        """Test that an invalid UUID raises 400 and names the offending field."""
+        with pytest.raises(HTTPException) as exc_info:
+            _validate_uuid("not-a-uuid", "document_id")
+
+        assert exc_info.value.status_code == 400
+        assert "Invalid document_id format" in exc_info.value.detail
+
+
+class TestAdminRouter:
+    """Tests for admin router creation."""
+
+    def test_creates_router_with_endpoints(self):
+        """Test that the router exposes token, document, stats and detail paths."""
+        router = create_admin_router((".pdf", ".png", ".jpg"))
+
+        # Get route paths (include prefix from router)
+        paths = [route.path for route in router.routes]
+
+        # Substring checks only, so they hold regardless of the router prefix
+        assert any("/auth/token" in p for p in paths)
+        assert any("/documents" in p for p in paths)
+        assert any("/documents/stats" in p for p in paths)
+        assert any("{document_id}" in p for p in paths)
+
+
+class TestCreateTokenEndpoint:
+    """Tests for POST /admin/auth/token endpoint."""
+
+    @pytest.fixture
+    def mock_db(self):
+        """Create mock AdminDB (requested by the test below but unused in its body)."""
+        db = MagicMock()
+        db.is_valid_admin_token.return_value = True
+        return db
+
+    def test_create_token_success(self, mock_db):
+        """Test that AdminTokenCreate accepts a name and expiry (schema only, no HTTP call)."""
+        from src.web.schemas.admin import AdminTokenCreate
+
+        request = AdminTokenCreate(name="Test Token", expires_in_days=30)
+
+        # The actual endpoint would generate a token
+        # This tests the schema validation
+        assert request.name == "Test Token"
+        assert request.expires_in_days == 30
+
+
+class TestDocumentUploadEndpoint:
+    """Tests for POST /admin/documents endpoint."""
+
+    @pytest.fixture
+    def sample_pdf_bytes(self):
+        """Create sample PDF-like bytes (no test in this class requests it yet)."""
+        # Minimal PDF header
+        return b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
+
+    @pytest.fixture
+    def mock_admin_db(self):
+        """Create mock AdminDB (no test in this class requests it yet)."""
+        db = MagicMock()
+        db.is_valid_admin_token.return_value = True
+        db.create_document.return_value = TEST_DOC_UUID
+        return db
+
+    def test_rejects_invalid_extension(self):
+        """Placeholder: checks a local allow-list tuple; the upload route is not called."""
+        # Schema validation would happen at the route level
+        allowed = (".pdf", ".png", ".jpg")
+        file_ext = ".exe"
+
+        assert file_ext not in allowed
+
+
+class TestDocumentListEndpoint:
+    """Tests for GET /admin/documents endpoint."""
+
+    @pytest.fixture
+    def sample_documents(self):
+        """Create sample documents (no test in this class requests it yet)."""
+        return [
+            AdminDocument(
+                document_id=UUID(TEST_DOC_UUID),
+                admin_token=TEST_TOKEN,
+                filename="test.pdf",
+                file_size=1024,
+                content_type="application/pdf",
+                file_path="/tmp/test.pdf",
+                page_count=1,
+                status="pending",
+            ),
+        ]
+
+    def test_validates_status_filter(self):
+        """Placeholder: checks a local tuple of statuses; the list route is not called."""
+        valid_statuses = ("pending", "auto_labeling", "labeled", "exported")
+
+        assert "invalid_status" not in valid_statuses
+        assert "pending" in valid_statuses
+
+
+class TestDocumentDetailEndpoint:
+    """Tests for GET /admin/documents/{document_id} endpoint."""
+
+    def test_requires_valid_uuid(self):
+        """Test that _validate_uuid rejects a malformed document_id with 400."""
+        with pytest.raises(HTTPException) as exc_info:
+            _validate_uuid("invalid", "document_id")
+
+        assert exc_info.value.status_code == 400
+
+
+class TestDocumentDeleteEndpoint:
+    """Tests for DELETE /admin/documents/{document_id} endpoint."""
+
+    def test_validates_document_id(self):
+        """Test that _validate_uuid accepts a well-formed id and rejects a malformed one."""
+        # Valid UUID should not raise
+        _validate_uuid(TEST_DOC_UUID, "document_id")
+
+        # Invalid should raise
+        with pytest.raises(HTTPException):
+            _validate_uuid("bad-id", "document_id")
+
+
+class TestDocumentStatusUpdateEndpoint:
+    """Tests for PATCH /admin/documents/{document_id}/status endpoint."""
+
+    def test_validates_status_values(self):
+        """Placeholder: checks a local tuple of statuses; the PATCH route is not called."""
+        valid_statuses = ("pending", "labeled", "exported")
+
+        assert "pending" in valid_statuses
+        assert "invalid" not in valid_statuses
diff --git a/tests/web/test_admin_routes_enhanced.py b/tests/web/test_admin_routes_enhanced.py
new file mode 100644
index 0000000..5dac633
--- /dev/null
+++ b/tests/web/test_admin_routes_enhanced.py
@@ -0,0 +1,351 @@
+"""
+Tests for Enhanced Admin Document Routes (Phase 3).
+""" + +import pytest +from datetime import datetime +from uuid import uuid4 + +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.web.api.v1.admin.documents import create_admin_router +from src.web.core.auth import validate_admin_token, get_admin_db + + +class MockAdminDocument: + """Mock AdminDocument for testing.""" + + def __init__(self, **kwargs): + self.document_id = kwargs.get('document_id', uuid4()) + self.admin_token = kwargs.get('admin_token', 'test-token') + self.filename = kwargs.get('filename', 'test.pdf') + self.file_size = kwargs.get('file_size', 100000) + self.content_type = kwargs.get('content_type', 'application/pdf') + self.page_count = kwargs.get('page_count', 1) + self.status = kwargs.get('status', 'pending') + self.auto_label_status = kwargs.get('auto_label_status', None) + self.auto_label_error = kwargs.get('auto_label_error', None) + self.upload_source = kwargs.get('upload_source', 'ui') + self.batch_id = kwargs.get('batch_id', None) + self.csv_field_values = kwargs.get('csv_field_values', None) + self.annotation_lock_until = kwargs.get('annotation_lock_until', None) + self.created_at = kwargs.get('created_at', datetime.utcnow()) + self.updated_at = kwargs.get('updated_at', datetime.utcnow()) + + +class MockAnnotation: + """Mock AdminAnnotation for testing.""" + + def __init__(self, **kwargs): + self.annotation_id = kwargs.get('annotation_id', uuid4()) + self.document_id = kwargs.get('document_id') + self.page_number = kwargs.get('page_number', 1) + self.class_id = kwargs.get('class_id', 0) + self.class_name = kwargs.get('class_name', 'invoice_number') + self.bbox_x = kwargs.get('bbox_x', 100.0) + self.bbox_y = kwargs.get('bbox_y', 100.0) + self.bbox_width = kwargs.get('bbox_width', 200.0) + self.bbox_height = kwargs.get('bbox_height', 50.0) + self.x_center = kwargs.get('x_center', 0.5) + self.y_center = kwargs.get('y_center', 0.5) + self.width = kwargs.get('width', 0.3) + self.height = kwargs.get('height', 
0.1) + self.text_value = kwargs.get('text_value', 'INV-001') + self.confidence = kwargs.get('confidence', 0.95) + self.source = kwargs.get('source', 'manual') + self.created_at = kwargs.get('created_at', datetime.utcnow()) + + +class MockAdminDB: + """Mock AdminDB for testing enhanced features.""" + + def __init__(self): + self.documents = {} + self.annotations = {} + + def get_documents_by_token( + self, + admin_token, + status=None, + upload_source=None, + has_annotations=None, + auto_label_status=None, + batch_id=None, + limit=20, + offset=0 + ): + """Get filtered documents.""" + docs = list(self.documents.values()) + + # Apply filters + if status: + docs = [d for d in docs if d.status == status] + if upload_source: + docs = [d for d in docs if d.upload_source == upload_source] + if has_annotations is not None: + for d in docs[:]: + ann_count = len(self.annotations.get(str(d.document_id), [])) + if has_annotations and ann_count == 0: + docs.remove(d) + elif not has_annotations and ann_count > 0: + docs.remove(d) + if auto_label_status: + docs = [d for d in docs if d.auto_label_status == auto_label_status] + if batch_id: + docs = [d for d in docs if str(d.batch_id) == str(batch_id)] + + total = len(docs) + return docs[offset:offset+limit], total + + def get_annotations_for_document(self, document_id): + """Get annotations for document.""" + return self.annotations.get(str(document_id), []) + + def count_documents_by_status(self, admin_token): + """Count documents by status.""" + counts = {} + for doc in self.documents.values(): + if doc.admin_token == admin_token: + counts[doc.status] = counts.get(doc.status, 0) + 1 + return counts + + def get_document_by_token(self, document_id, admin_token): + """Get single document by ID and token.""" + doc = self.documents.get(document_id) + if doc and doc.admin_token == admin_token: + return doc + return None + + def get_document_training_tasks(self, document_id): + """Get training tasks that used this document.""" + return 
[] # No training history in this test + + def get_training_task(self, task_id): + """Get training task by ID.""" + return None # No training tasks in this test + + +@pytest.fixture +def app(): + """Create test FastAPI app.""" + app = FastAPI() + + # Create mock DB + mock_db = MockAdminDB() + + # Add test documents + doc1 = MockAdminDocument( + filename="INV001.pdf", + status="labeled", + upload_source="ui", + auto_label_status=None, + batch_id=None + ) + doc2 = MockAdminDocument( + filename="INV002.pdf", + status="labeled", + upload_source="api", + auto_label_status="completed", + batch_id=uuid4() + ) + doc3 = MockAdminDocument( + filename="INV003.pdf", + status="pending", + upload_source="ui", + auto_label_status=None, # Not auto-labeled yet + batch_id=None + ) + + mock_db.documents[str(doc1.document_id)] = doc1 + mock_db.documents[str(doc2.document_id)] = doc2 + mock_db.documents[str(doc3.document_id)] = doc3 + + # Add annotations to doc1 and doc2 + mock_db.annotations[str(doc1.document_id)] = [ + MockAnnotation( + document_id=doc1.document_id, + class_name="invoice_number", + text_value="INV-001" + ) + ] + mock_db.annotations[str(doc2.document_id)] = [ + MockAnnotation( + document_id=doc2.document_id, + class_id=6, + class_name="amount", + text_value="1500.00" + ), + MockAnnotation( + document_id=doc2.document_id, + class_id=1, + class_name="invoice_date", + text_value="2024-01-15" + ) + ] + + # Override dependencies + app.dependency_overrides[validate_admin_token] = lambda: "test-token" + app.dependency_overrides[get_admin_db] = lambda: mock_db + + # Include router + router = create_admin_router((".pdf", ".png", ".jpg")) + app.include_router(router) + + return app + + +@pytest.fixture +def client(app): + """Create test client.""" + return TestClient(app) + + +class TestEnhancedDocumentList: + """Tests for enhanced document list endpoint.""" + + def test_list_documents_filter_by_upload_source_ui(self, client): + """Test filtering documents by 
upload_source=ui.""" + response = client.get("/admin/documents?upload_source=ui") + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 2 + assert all(doc["filename"].startswith("INV") for doc in data["documents"]) + + def test_list_documents_filter_by_upload_source_api(self, client): + """Test filtering documents by upload_source=api.""" + response = client.get("/admin/documents?upload_source=api") + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 1 + assert data["documents"][0]["filename"] == "INV002.pdf" + + def test_list_documents_filter_by_has_annotations_true(self, client): + """Test filtering documents with annotations.""" + response = client.get("/admin/documents?has_annotations=true") + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 2 + + def test_list_documents_filter_by_has_annotations_false(self, client): + """Test filtering documents without annotations.""" + response = client.get("/admin/documents?has_annotations=false") + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 1 + + def test_list_documents_filter_by_auto_label_status(self, client): + """Test filtering by auto_label_status.""" + response = client.get("/admin/documents?auto_label_status=completed") + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 1 + assert data["documents"][0]["filename"] == "INV002.pdf" + + def test_list_documents_filter_by_batch_id(self, client): + """Test filtering by batch_id.""" + # Get a batch_id from the test data + response_all = client.get("/admin/documents?upload_source=api") + batch_id = response_all.json()["documents"][0]["batch_id"] + + response = client.get(f"/admin/documents?batch_id={batch_id}") + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 1 + + def test_list_documents_combined_filters(self, client): + """Test combining 
multiple filters.""" + response = client.get( + "/admin/documents?status=labeled&upload_source=api" + ) + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 1 + assert data["documents"][0]["filename"] == "INV002.pdf" + + def test_document_item_includes_new_fields(self, client): + """Test DocumentItem includes new Phase 2/3 fields.""" + response = client.get("/admin/documents?upload_source=api") + + assert response.status_code == 200 + data = response.json() + doc = data["documents"][0] + + # Check new fields exist + assert "upload_source" in doc + assert doc["upload_source"] == "api" + assert "batch_id" in doc + assert doc["batch_id"] is not None + assert "can_annotate" in doc + assert isinstance(doc["can_annotate"], bool) + + +class TestEnhancedDocumentDetail: + """Tests for enhanced document detail endpoint.""" + + def test_document_detail_includes_new_fields(self, client, app): + """Test DocumentDetailResponse includes new Phase 2/3 fields.""" + # Get a document ID from list + response = client.get("/admin/documents?upload_source=api") + assert response.status_code == 200 + doc_list = response.json() + document_id = doc_list["documents"][0]["document_id"] + + # Get document detail + response = client.get(f"/admin/documents/{document_id}") + assert response.status_code == 200 + doc = response.json() + + # Check new fields exist + assert "upload_source" in doc + assert doc["upload_source"] == "api" + assert "batch_id" in doc + assert doc["batch_id"] is not None + assert "can_annotate" in doc + assert isinstance(doc["can_annotate"], bool) + assert "csv_field_values" in doc + assert "annotation_lock_until" in doc + + def test_document_detail_ui_upload_defaults(self, client, app): + """Test UI-uploaded document has correct defaults.""" + # Get a UI-uploaded document + response = client.get("/admin/documents?upload_source=ui") + assert response.status_code == 200 + doc_list = response.json() + document_id = 
doc_list["documents"][0]["document_id"] + + # Get document detail + response = client.get(f"/admin/documents/{document_id}") + assert response.status_code == 200 + doc = response.json() + + # UI uploads should have these defaults + assert doc["upload_source"] == "ui" + assert doc["batch_id"] is None + assert doc["csv_field_values"] is None + assert doc["can_annotate"] is True + assert doc["annotation_lock_until"] is None + + def test_document_detail_with_annotations(self, client, app): + """Test document detail includes annotations.""" + # Get a document with annotations + response = client.get("/admin/documents?has_annotations=true") + assert response.status_code == 200 + doc_list = response.json() + document_id = doc_list["documents"][0]["document_id"] + + # Get document detail + response = client.get(f"/admin/documents/{document_id}") + assert response.status_code == 200 + doc = response.json() + + # Should have annotations + assert "annotations" in doc + assert len(doc["annotations"]) > 0 diff --git a/tests/web/test_admin_training.py b/tests/web/test_admin_training.py new file mode 100644 index 0000000..62e84ac --- /dev/null +++ b/tests/web/test_admin_training.py @@ -0,0 +1,247 @@ +""" +Tests for Admin Training Routes and Scheduler. 
+""" + +import pytest +from datetime import datetime, timedelta +from unittest.mock import MagicMock, patch +from uuid import UUID + +from src.data.admin_models import TrainingTask, TrainingLog +from src.web.api.v1.admin.training import _validate_uuid, create_training_router +from src.web.core.scheduler import ( + TrainingScheduler, + get_training_scheduler, + start_scheduler, + stop_scheduler, +) +from src.web.schemas.admin import ( + TrainingConfig, + TrainingStatus, + TrainingTaskCreate, + TrainingType, +) + + +# Test UUIDs +TEST_TASK_UUID = "770e8400-e29b-41d4-a716-446655440002" +TEST_TOKEN = "test-admin-token-12345" + + +class TestTrainingRouterCreation: + """Tests for training router creation.""" + + def test_creates_router_with_endpoints(self): + """Test router is created with expected endpoints.""" + router = create_training_router() + + # Get route paths (include prefix) + paths = [route.path for route in router.routes] + + # Paths include the /admin/training prefix + assert any("/tasks" in p for p in paths) + assert any("{task_id}" in p for p in paths) + assert any("cancel" in p for p in paths) + assert any("logs" in p for p in paths) + assert any("export" in p for p in paths) + + +class TestTrainingConfigSchema: + """Tests for TrainingConfig schema.""" + + def test_default_config(self): + """Test default training configuration.""" + config = TrainingConfig() + + assert config.model_name == "yolo11n.pt" + assert config.epochs == 100 + assert config.batch_size == 16 + assert config.image_size == 640 + assert config.learning_rate == 0.01 + assert config.device == "0" + + def test_custom_config(self): + """Test custom training configuration.""" + config = TrainingConfig( + model_name="yolo11s.pt", + epochs=50, + batch_size=8, + image_size=416, + learning_rate=0.001, + device="cpu", + ) + + assert config.model_name == "yolo11s.pt" + assert config.epochs == 50 + assert config.batch_size == 8 + + def test_config_validation(self): + """Test config validation 
constraints.""" + # Epochs must be 1-1000 + config = TrainingConfig(epochs=1) + assert config.epochs == 1 + + config = TrainingConfig(epochs=1000) + assert config.epochs == 1000 + + +class TestTrainingTaskCreateSchema: + """Tests for TrainingTaskCreate schema.""" + + def test_minimal_task(self): + """Test minimal task creation.""" + task = TrainingTaskCreate(name="Test Training") + + assert task.name == "Test Training" + assert task.task_type == TrainingType.TRAIN + assert task.description is None + assert task.scheduled_at is None + + def test_scheduled_task(self): + """Test scheduled task creation.""" + scheduled_time = datetime.utcnow() + timedelta(hours=1) + task = TrainingTaskCreate( + name="Scheduled Training", + scheduled_at=scheduled_time, + ) + + assert task.scheduled_at == scheduled_time + + def test_recurring_task(self): + """Test recurring task with cron expression.""" + task = TrainingTaskCreate( + name="Recurring Training", + cron_expression="0 0 * * 0", # Every Sunday at midnight + ) + + assert task.cron_expression == "0 0 * * 0" + + +class TestTrainingTaskModel: + """Tests for TrainingTask model.""" + + def test_task_creation(self): + """Test training task model creation.""" + task = TrainingTask( + admin_token=TEST_TOKEN, + name="Test Task", + task_type="train", + status="pending", + ) + + assert task.name == "Test Task" + assert task.task_type == "train" + assert task.status == "pending" + + def test_task_with_config(self): + """Test task with configuration.""" + config = { + "model_name": "yolo11n.pt", + "epochs": 100, + } + task = TrainingTask( + admin_token=TEST_TOKEN, + name="Configured Task", + task_type="train", + config=config, + ) + + assert task.config == config + assert task.config["epochs"] == 100 + + +class TestTrainingLogModel: + """Tests for TrainingLog model.""" + + def test_log_creation(self): + """Test training log creation.""" + log = TrainingLog( + task_id=UUID(TEST_TASK_UUID), + level="INFO", + message="Training started", + ) + 
+ assert str(log.task_id) == TEST_TASK_UUID + assert log.level == "INFO" + assert log.message == "Training started" + + def test_log_with_details(self): + """Test log with additional details.""" + details = { + "epoch": 10, + "loss": 0.5, + "mAP": 0.85, + } + log = TrainingLog( + task_id=UUID(TEST_TASK_UUID), + level="INFO", + message="Epoch completed", + details=details, + ) + + assert log.details == details + assert log.details["epoch"] == 10 + + +class TestTrainingScheduler: + """Tests for TrainingScheduler.""" + + @pytest.fixture + def scheduler(self): + """Create a scheduler for testing.""" + return TrainingScheduler(check_interval_seconds=1) + + def test_scheduler_creation(self, scheduler): + """Test scheduler creation.""" + assert scheduler._check_interval == 1 + assert scheduler._running is False + assert scheduler._thread is None + + def test_scheduler_start_stop(self, scheduler): + """Test scheduler start and stop.""" + with patch.object(scheduler, "_check_pending_tasks"): + scheduler.start() + assert scheduler._running is True + assert scheduler._thread is not None + + scheduler.stop() + assert scheduler._running is False + + def test_scheduler_singleton(self): + """Test get_training_scheduler returns singleton.""" + # Reset any existing scheduler + stop_scheduler() + + s1 = get_training_scheduler() + s2 = get_training_scheduler() + + assert s1 is s2 + + # Cleanup + stop_scheduler() + + +class TestTrainingStatusEnum: + """Tests for TrainingStatus enum.""" + + def test_all_statuses(self): + """Test all training statuses are defined.""" + statuses = [s.value for s in TrainingStatus] + + assert "pending" in statuses + assert "scheduled" in statuses + assert "running" in statuses + assert "completed" in statuses + assert "failed" in statuses + assert "cancelled" in statuses + + +class TestTrainingTypeEnum: + """Tests for TrainingType enum.""" + + def test_all_types(self): + """Test all training types are defined.""" + types = [t.value for t in TrainingType] 
+ + assert "train" in types + assert "finetune" in types diff --git a/tests/web/test_annotation_locks.py b/tests/web/test_annotation_locks.py new file mode 100644 index 0000000..dfff46d --- /dev/null +++ b/tests/web/test_annotation_locks.py @@ -0,0 +1,276 @@ +""" +Tests for Annotation Lock Mechanism (Phase 3.3). +""" + +import pytest +from datetime import datetime, timedelta, timezone +from uuid import uuid4 + +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.web.api.v1.admin.documents import create_admin_router +from src.web.core.auth import validate_admin_token, get_admin_db + + +class MockAdminDocument: + """Mock AdminDocument for testing.""" + + def __init__(self, **kwargs): + self.document_id = kwargs.get('document_id', uuid4()) + self.admin_token = kwargs.get('admin_token', 'test-token') + self.filename = kwargs.get('filename', 'test.pdf') + self.file_size = kwargs.get('file_size', 100000) + self.content_type = kwargs.get('content_type', 'application/pdf') + self.page_count = kwargs.get('page_count', 1) + self.status = kwargs.get('status', 'pending') + self.auto_label_status = kwargs.get('auto_label_status', None) + self.auto_label_error = kwargs.get('auto_label_error', None) + self.upload_source = kwargs.get('upload_source', 'ui') + self.batch_id = kwargs.get('batch_id', None) + self.csv_field_values = kwargs.get('csv_field_values', None) + self.annotation_lock_until = kwargs.get('annotation_lock_until', None) + self.created_at = kwargs.get('created_at', datetime.utcnow()) + self.updated_at = kwargs.get('updated_at', datetime.utcnow()) + + +class MockAdminDB: + """Mock AdminDB for testing annotation locks.""" + + def __init__(self): + self.documents = {} + + def get_document_by_token(self, document_id, admin_token): + """Get single document by ID and token.""" + doc = self.documents.get(document_id) + if doc and doc.admin_token == admin_token: + return doc + return None + + def acquire_annotation_lock(self, document_id, 
admin_token, duration_seconds=300): + """Acquire annotation lock for a document.""" + doc = self.documents.get(document_id) + if not doc or doc.admin_token != admin_token: + return None + + # Check if already locked + now = datetime.now(timezone.utc) + if doc.annotation_lock_until and doc.annotation_lock_until > now: + return None + + # Acquire lock + doc.annotation_lock_until = now + timedelta(seconds=duration_seconds) + return doc + + def release_annotation_lock(self, document_id, admin_token, force=False): + """Release annotation lock for a document.""" + doc = self.documents.get(document_id) + if not doc or doc.admin_token != admin_token: + return None + + # Release lock + doc.annotation_lock_until = None + return doc + + def extend_annotation_lock(self, document_id, admin_token, additional_seconds=300): + """Extend an existing annotation lock.""" + doc = self.documents.get(document_id) + if not doc or doc.admin_token != admin_token: + return None + + # Check if lock exists and is still valid + now = datetime.now(timezone.utc) + if not doc.annotation_lock_until or doc.annotation_lock_until <= now: + return None + + # Extend lock + doc.annotation_lock_until = doc.annotation_lock_until + timedelta(seconds=additional_seconds) + return doc + + +@pytest.fixture +def app(): + """Create test FastAPI app.""" + app = FastAPI() + + # Create mock DB + mock_db = MockAdminDB() + + # Add test document + doc1 = MockAdminDocument( + filename="INV001.pdf", + status="pending", + upload_source="ui", + ) + + mock_db.documents[str(doc1.document_id)] = doc1 + + # Override dependencies + app.dependency_overrides[validate_admin_token] = lambda: "test-token" + app.dependency_overrides[get_admin_db] = lambda: mock_db + + # Include router + router = create_admin_router((".pdf", ".png", ".jpg")) + app.include_router(router) + + return app + + +@pytest.fixture +def client(app): + """Create test client.""" + return TestClient(app) + + +@pytest.fixture +def document_id(app): + """Get 
document ID from the mock DB.""" + mock_db = app.dependency_overrides[get_admin_db]() + return str(list(mock_db.documents.keys())[0]) + + +class TestAnnotationLocks: + """Tests for annotation lock endpoints.""" + + def test_acquire_lock_success(self, client, document_id): + """Test successfully acquiring an annotation lock.""" + response = client.post( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + + assert response.status_code == 200 + data = response.json() + assert data["document_id"] == document_id + assert data["locked"] is True + assert data["lock_expires_at"] is not None + assert "Lock acquired for 300 seconds" in data["message"] + + def test_acquire_lock_already_locked(self, client, document_id): + """Test acquiring lock on already locked document.""" + # First lock + response1 = client.post( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + assert response1.status_code == 200 + + # Try to lock again + response2 = client.post( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + assert response2.status_code == 409 + assert "already locked" in response2.json()["detail"] + + def test_release_lock_success(self, client, document_id): + """Test successfully releasing an annotation lock.""" + # First acquire lock + client.post( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + + # Then release it + response = client.delete(f"/admin/documents/{document_id}/lock") + + assert response.status_code == 200 + data = response.json() + assert data["document_id"] == document_id + assert data["locked"] is False + assert data["lock_expires_at"] is None + assert "released successfully" in data["message"] + + def test_release_lock_not_locked(self, client, document_id): + """Test releasing lock on unlocked document.""" + response = client.delete(f"/admin/documents/{document_id}/lock") + + # Should succeed even if not locked + assert response.status_code == 
200 + data = response.json() + assert data["locked"] is False + + def test_extend_lock_success(self, client, document_id): + """Test successfully extending an annotation lock.""" + # First acquire lock + response1 = client.post( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + original_expiry = response1.json()["lock_expires_at"] + + # Extend lock + response2 = client.patch( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + + assert response2.status_code == 200 + data = response2.json() + assert data["document_id"] == document_id + assert data["locked"] is True + assert data["lock_expires_at"] != original_expiry + assert "extended by 300 seconds" in data["message"] + + def test_extend_lock_not_locked(self, client, document_id): + """Test extending lock on unlocked document.""" + response = client.patch( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + + assert response.status_code == 409 + assert "doesn't exist or has expired" in response.json()["detail"] + + def test_acquire_lock_custom_duration(self, client, document_id): + """Test acquiring lock with custom duration.""" + response = client.post( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 600} + ) + + assert response.status_code == 200 + data = response.json() + assert "Lock acquired for 600 seconds" in data["message"] + + def test_acquire_lock_invalid_document(self, client): + """Test acquiring lock on non-existent document.""" + fake_id = str(uuid4()) + response = client.post( + f"/admin/documents/{fake_id}/lock", + json={"duration_seconds": 300} + ) + + assert response.status_code == 404 + assert "not found" in response.json()["detail"] + + def test_lock_lifecycle(self, client, document_id): + """Test complete lock lifecycle: acquire -> extend -> release.""" + # Acquire + response1 = client.post( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + assert 
response1.status_code == 200 + assert response1.json()["locked"] is True + + # Extend + response2 = client.patch( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + assert response2.status_code == 200 + assert response2.json()["locked"] is True + + # Release + response3 = client.delete(f"/admin/documents/{document_id}/lock") + assert response3.status_code == 200 + assert response3.json()["locked"] is False + + # Verify can acquire again after release + response4 = client.post( + f"/admin/documents/{document_id}/lock", + json={"duration_seconds": 300} + ) + assert response4.status_code == 200 + assert response4.json()["locked"] is True diff --git a/tests/web/test_annotation_phase5.py b/tests/web/test_annotation_phase5.py new file mode 100644 index 0000000..cba8c20 --- /dev/null +++ b/tests/web/test_annotation_phase5.py @@ -0,0 +1,420 @@ +""" +Tests for Phase 5: Annotation Enhancement (Verification and Override) +""" + +import pytest +from datetime import datetime +from uuid import uuid4 + +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.web.api.v1.admin.annotations import create_annotation_router +from src.web.core.auth import validate_admin_token, get_admin_db + + +class MockAdminDocument: + """Mock AdminDocument for testing.""" + + def __init__(self, **kwargs): + self.document_id = kwargs.get('document_id', uuid4()) + self.admin_token = kwargs.get('admin_token', 'test-token') + self.filename = kwargs.get('filename', 'test.pdf') + self.file_size = kwargs.get('file_size', 100000) + self.content_type = kwargs.get('content_type', 'application/pdf') + self.page_count = kwargs.get('page_count', 1) + self.status = kwargs.get('status', 'labeled') + self.auto_label_status = kwargs.get('auto_label_status', None) + self.created_at = kwargs.get('created_at', datetime.utcnow()) + self.updated_at = kwargs.get('updated_at', datetime.utcnow()) + + +class MockAnnotation: + """Mock AdminAnnotation for testing.""" + + 
def __init__(self, **kwargs): + self.annotation_id = kwargs.get('annotation_id', uuid4()) + self.document_id = kwargs.get('document_id') + self.page_number = kwargs.get('page_number', 1) + self.class_id = kwargs.get('class_id', 0) + self.class_name = kwargs.get('class_name', 'invoice_number') + self.bbox_x = kwargs.get('bbox_x', 100) + self.bbox_y = kwargs.get('bbox_y', 100) + self.bbox_width = kwargs.get('bbox_width', 200) + self.bbox_height = kwargs.get('bbox_height', 50) + self.x_center = kwargs.get('x_center', 0.5) + self.y_center = kwargs.get('y_center', 0.5) + self.width = kwargs.get('width', 0.3) + self.height = kwargs.get('height', 0.1) + self.text_value = kwargs.get('text_value', 'INV-001') + self.confidence = kwargs.get('confidence', 0.95) + self.source = kwargs.get('source', 'auto') + self.is_verified = kwargs.get('is_verified', False) + self.verified_at = kwargs.get('verified_at', None) + self.verified_by = kwargs.get('verified_by', None) + self.override_source = kwargs.get('override_source', None) + self.original_annotation_id = kwargs.get('original_annotation_id', None) + self.created_at = kwargs.get('created_at', datetime.utcnow()) + self.updated_at = kwargs.get('updated_at', datetime.utcnow()) + + +class MockAnnotationHistory: + """Mock AnnotationHistory for testing.""" + + def __init__(self, **kwargs): + self.history_id = kwargs.get('history_id', uuid4()) + self.annotation_id = kwargs.get('annotation_id') + self.document_id = kwargs.get('document_id') + self.action = kwargs.get('action', 'override') + self.previous_value = kwargs.get('previous_value', {}) + self.new_value = kwargs.get('new_value', {}) + self.changed_by = kwargs.get('changed_by', 'test-token') + self.change_reason = kwargs.get('change_reason', None) + self.created_at = kwargs.get('created_at', datetime.utcnow()) + + +class MockAdminDB: + """Mock AdminDB for testing Phase 5.""" + + def __init__(self): + self.documents = {} + self.annotations = {} + self.annotation_history = {} + + 
def get_document_by_token(self, document_id, admin_token): + """Get document by ID and token.""" + doc = self.documents.get(str(document_id)) + if doc and doc.admin_token == admin_token: + return doc + return None + + def verify_annotation(self, annotation_id, admin_token): + """Mark annotation as verified.""" + annotation = self.annotations.get(str(annotation_id)) + if annotation: + annotation.is_verified = True + annotation.verified_at = datetime.utcnow() + annotation.verified_by = admin_token + return annotation + return None + + def override_annotation( + self, + annotation_id, + admin_token, + change_reason=None, + **updates, + ): + """Override an annotation.""" + annotation = self.annotations.get(str(annotation_id)) + if annotation: + # Apply updates + for key, value in updates.items(): + if hasattr(annotation, key): + setattr(annotation, key, value) + + # Mark as overridden if was auto-generated + if annotation.source == "auto": + annotation.override_source = "auto" + annotation.source = "manual" + + # Create history record + history = MockAnnotationHistory( + annotation_id=uuid4().hex if isinstance(annotation_id, str) else annotation_id, + document_id=annotation.document_id, + action="override", + changed_by=admin_token, + change_reason=change_reason, + ) + self.annotation_history[str(annotation.annotation_id)] = [history] + + return annotation + return None + + def get_annotation_history(self, annotation_id): + """Get annotation history.""" + return self.annotation_history.get(str(annotation_id), []) + + +@pytest.fixture +def app(): + """Create test FastAPI app.""" + app = FastAPI() + + # Create mock DB + mock_db = MockAdminDB() + + # Add test document + doc1 = MockAdminDocument( + filename="TEST001.pdf", + status="labeled", + ) + mock_db.documents[str(doc1.document_id)] = doc1 + + # Add test annotations + ann1 = MockAnnotation( + document_id=doc1.document_id, + class_id=0, + class_name="invoice_number", + text_value="INV-001", + source="auto", + 
confidence=0.95, + ) + ann2 = MockAnnotation( + document_id=doc1.document_id, + class_id=6, + class_name="amount", + text_value="1500.00", + source="auto", + confidence=0.98, + ) + + mock_db.annotations[str(ann1.annotation_id)] = ann1 + mock_db.annotations[str(ann2.annotation_id)] = ann2 + + # Store document ID and annotation IDs for tests + app.state.document_id = str(doc1.document_id) + app.state.annotation_id_1 = str(ann1.annotation_id) + app.state.annotation_id_2 = str(ann2.annotation_id) + + # Override dependencies + app.dependency_overrides[validate_admin_token] = lambda: "test-token" + app.dependency_overrides[get_admin_db] = lambda: mock_db + + # Include router + router = create_annotation_router() + app.include_router(router) + + return app + + +@pytest.fixture +def client(app): + """Create test client.""" + return TestClient(app) + + +class TestAnnotationVerification: + """Tests for POST /admin/documents/{document_id}/annotations/{annotation_id}/verify endpoint.""" + + def test_verify_annotation_success(self, client, app): + """Test successfully verifying an annotation.""" + document_id = app.state.document_id + annotation_id = app.state.annotation_id_1 + + response = client.post( + f"/admin/documents/{document_id}/annotations/{annotation_id}/verify" + ) + + assert response.status_code == 200 + data = response.json() + assert data["annotation_id"] == annotation_id + assert data["is_verified"] is True + assert data["verified_at"] is not None + assert data["verified_by"] == "test-token" + assert "verified successfully" in data["message"].lower() + + def test_verify_annotation_not_found(self, client, app): + """Test verifying non-existent annotation.""" + document_id = app.state.document_id + fake_annotation_id = str(uuid4()) + + response = client.post( + f"/admin/documents/{document_id}/annotations/{fake_annotation_id}/verify" + ) + + assert response.status_code == 404 + assert "not found" in response.json()["detail"].lower() + + def 
test_verify_annotation_document_not_found(self, client): + """Test verifying annotation with non-existent document.""" + fake_document_id = str(uuid4()) + fake_annotation_id = str(uuid4()) + + response = client.post( + f"/admin/documents/{fake_document_id}/annotations/{fake_annotation_id}/verify" + ) + + assert response.status_code == 404 + assert "not found" in response.json()["detail"].lower() + + def test_verify_annotation_invalid_uuid(self, client, app): + """Test verifying annotation with invalid UUID format.""" + document_id = app.state.document_id + + response = client.post( + f"/admin/documents/{document_id}/annotations/invalid-uuid/verify" + ) + + assert response.status_code == 400 + assert "invalid" in response.json()["detail"].lower() + + +class TestAnnotationOverride: + """Tests for PATCH /admin/documents/{document_id}/annotations/{annotation_id}/override endpoint.""" + + def test_override_annotation_text_value(self, client, app): + """Test overriding annotation text value.""" + document_id = app.state.document_id + annotation_id = app.state.annotation_id_1 + + response = client.patch( + f"/admin/documents/{document_id}/annotations/{annotation_id}/override", + json={ + "text_value": "INV-001-CORRECTED", + "reason": "OCR error correction" + } + ) + + assert response.status_code == 200 + data = response.json() + assert data["annotation_id"] == annotation_id + assert data["source"] == "manual" + assert data["override_source"] == "auto" + assert "successfully" in data["message"].lower() + assert "history_id" in data + + def test_override_annotation_bbox(self, client, app): + """Test overriding annotation bounding box.""" + document_id = app.state.document_id + annotation_id = app.state.annotation_id_1 + + response = client.patch( + f"/admin/documents/{document_id}/annotations/{annotation_id}/override", + json={ + "bbox": { + "x": 110, + "y": 205, + "width": 195, + "height": 48 + }, + "reason": "Bbox adjustment" + } + ) + + assert response.status_code == 200 
+ data = response.json() + assert data["annotation_id"] == annotation_id + assert data["source"] == "manual" + + def test_override_annotation_class(self, client, app): + """Test overriding annotation class.""" + document_id = app.state.document_id + annotation_id = app.state.annotation_id_1 + + response = client.patch( + f"/admin/documents/{document_id}/annotations/{annotation_id}/override", + json={ + "class_id": 1, + "class_name": "invoice_date", + "reason": "Wrong field classification" + } + ) + + assert response.status_code == 200 + data = response.json() + assert data["annotation_id"] == annotation_id + + def test_override_annotation_multiple_fields(self, client, app): + """Test overriding multiple annotation fields at once.""" + document_id = app.state.document_id + annotation_id = app.state.annotation_id_2 + + response = client.patch( + f"/admin/documents/{document_id}/annotations/{annotation_id}/override", + json={ + "text_value": "1550.00", + "bbox": { + "x": 120, + "y": 210, + "width": 180, + "height": 45 + }, + "reason": "Multiple corrections" + } + ) + + assert response.status_code == 200 + data = response.json() + assert data["annotation_id"] == annotation_id + + def test_override_annotation_no_updates(self, client, app): + """Test overriding annotation without providing any updates.""" + document_id = app.state.document_id + annotation_id = app.state.annotation_id_1 + + response = client.patch( + f"/admin/documents/{document_id}/annotations/{annotation_id}/override", + json={} + ) + + assert response.status_code == 400 + assert "no updates" in response.json()["detail"].lower() + + def test_override_annotation_not_found(self, client, app): + """Test overriding non-existent annotation.""" + document_id = app.state.document_id + fake_annotation_id = str(uuid4()) + + response = client.patch( + f"/admin/documents/{document_id}/annotations/{fake_annotation_id}/override", + json={ + "text_value": "TEST" + } + ) + + assert response.status_code == 404 + assert 
"not found" in response.json()["detail"].lower() + + def test_override_annotation_document_not_found(self, client): + """Test overriding annotation with non-existent document.""" + fake_document_id = str(uuid4()) + fake_annotation_id = str(uuid4()) + + response = client.patch( + f"/admin/documents/{fake_document_id}/annotations/{fake_annotation_id}/override", + json={ + "text_value": "TEST" + } + ) + + assert response.status_code == 404 + assert "not found" in response.json()["detail"].lower() + + def test_override_annotation_creates_history(self, client, app): + """Test that overriding annotation creates history record.""" + document_id = app.state.document_id + annotation_id = app.state.annotation_id_1 + + response = client.patch( + f"/admin/documents/{document_id}/annotations/{annotation_id}/override", + json={ + "text_value": "INV-CORRECTED", + "reason": "Test history creation" + } + ) + + assert response.status_code == 200 + data = response.json() + # History ID should be present and valid + assert "history_id" in data + assert data["history_id"] != "" + + def test_override_annotation_with_reason(self, client, app): + """Test overriding annotation with change reason.""" + document_id = app.state.document_id + annotation_id = app.state.annotation_id_1 + + change_reason = "Correcting OCR misread" + response = client.patch( + f"/admin/documents/{document_id}/annotations/{annotation_id}/override", + json={ + "text_value": "INV-002", + "reason": change_reason + } + ) + + assert response.status_code == 200 + # Reason is stored in history, not returned in response + data = response.json() + assert data["annotation_id"] == annotation_id diff --git a/tests/web/test_async_queue.py b/tests/web/test_async_queue.py new file mode 100644 index 0000000..1db16cf --- /dev/null +++ b/tests/web/test_async_queue.py @@ -0,0 +1,217 @@ +""" +Tests for the AsyncTaskQueue class. 
+""" + +import tempfile +import time +from datetime import datetime +from pathlib import Path +from threading import Event +from unittest.mock import MagicMock + +import pytest + +from src.web.workers.async_queue import AsyncTask, AsyncTaskQueue + + +class TestAsyncTask: + """Tests for AsyncTask dataclass.""" + + def test_create_task(self): + """Test creating an AsyncTask.""" + task = AsyncTask( + request_id="test-id", + api_key="test-key", + file_path=Path("/tmp/test.pdf"), + filename="test.pdf", + ) + + assert task.request_id == "test-id" + assert task.api_key == "test-key" + assert task.filename == "test.pdf" + assert task.priority == 0 + assert task.created_at is not None + + +class TestAsyncTaskQueue: + """Tests for AsyncTaskQueue.""" + + def test_init(self): + """Test queue initialization.""" + queue = AsyncTaskQueue(max_size=50, worker_count=2) + + assert queue._worker_count == 2 + assert queue._queue.maxsize == 50 + assert not queue._started + + def test_submit_task(self, task_queue, sample_task): + """Test submitting a task to the queue.""" + success = task_queue.submit(sample_task) + + assert success is True + assert task_queue.get_queue_depth() == 1 + + def test_submit_when_full(self, sample_task): + """Test submitting to a full queue.""" + queue = AsyncTaskQueue(max_size=1, worker_count=1) + + # Submit first task + queue.submit(sample_task) + + # Create second task + task2 = AsyncTask( + request_id="test-2", + api_key="test-key", + file_path=sample_task.file_path, + filename="test2.pdf", + ) + + # Queue should be full + success = queue.submit(task2) + assert success is False + + def test_get_queue_depth(self, task_queue, sample_task): + """Test getting queue depth.""" + assert task_queue.get_queue_depth() == 0 + + task_queue.submit(sample_task) + assert task_queue.get_queue_depth() == 1 + + def test_start_and_stop(self, task_queue): + """Test starting and stopping the queue.""" + handler = MagicMock() + + task_queue.start(handler) + assert 
task_queue._started is True + assert task_queue.is_running is True + assert len(task_queue._workers) == 1 + + task_queue.stop(timeout=5.0) + assert task_queue._started is False + assert task_queue.is_running is False + assert len(task_queue._workers) == 0 + + def test_worker_processes_task(self, sample_task): + """Test that worker thread processes tasks.""" + queue = AsyncTaskQueue(max_size=10, worker_count=1) + processed = Event() + + def handler(task): + processed.set() + + queue.start(handler) + queue.submit(sample_task) + + # Wait for processing + assert processed.wait(timeout=5.0) + + queue.stop() + + def test_worker_handles_errors(self, sample_task): + """Test that worker handles errors gracefully.""" + queue = AsyncTaskQueue(max_size=10, worker_count=1) + error_handled = Event() + + def failing_handler(task): + error_handled.set() + raise ValueError("Test error") + + queue.start(failing_handler) + queue.submit(sample_task) + + # Should not crash + assert error_handled.wait(timeout=5.0) + time.sleep(0.5) # Give time for error handling + + assert queue.is_running + + queue.stop() + + def test_processing_tracking(self, task_queue, sample_task): + """Test tracking of processing tasks.""" + processed = Event() + + def slow_handler(task): + processed.set() + time.sleep(0.5) + + task_queue.start(slow_handler) + task_queue.submit(sample_task) + + # Wait for processing to start + assert processed.wait(timeout=5.0) + + # Task should be in processing set + assert task_queue.get_processing_count() == 1 + assert task_queue.is_processing(sample_task.request_id) + + # Wait for completion + time.sleep(1.0) + + assert task_queue.get_processing_count() == 0 + assert not task_queue.is_processing(sample_task.request_id) + + task_queue.stop() + + def test_multiple_workers(self, sample_task): + """Test queue with multiple workers.""" + queue = AsyncTaskQueue(max_size=10, worker_count=3) + processed_count = [] + + def handler(task): + processed_count.append(task.request_id) + 
time.sleep(0.1) + + queue.start(handler) + + # Submit multiple tasks + for i in range(5): + task = AsyncTask( + request_id=f"task-{i}", + api_key="test-key", + file_path=sample_task.file_path, + filename=f"test-{i}.pdf", + ) + queue.submit(task) + + # Wait for all tasks + time.sleep(2.0) + + assert len(processed_count) == 5 + + queue.stop() + + def test_graceful_shutdown(self, sample_task): + """Test graceful shutdown waits for current task.""" + queue = AsyncTaskQueue(max_size=10, worker_count=1) + started = Event() + finished = Event() + + def slow_handler(task): + started.set() + time.sleep(0.5) + finished.set() + + queue.start(slow_handler) + queue.submit(sample_task) + + # Wait for processing to start + assert started.wait(timeout=5.0) + + # Stop should wait for task to finish + queue.stop(timeout=5.0) + + assert finished.is_set() + + def test_double_start(self, task_queue): + """Test that starting twice doesn't create duplicate workers.""" + handler = MagicMock() + + task_queue.start(handler) + assert len(task_queue._workers) == 1 + + # Starting again should not add more workers + task_queue.start(handler) + assert len(task_queue._workers) == 1 + + task_queue.stop() diff --git a/tests/web/test_async_routes.py b/tests/web/test_async_routes.py new file mode 100644 index 0000000..b7dfb6f --- /dev/null +++ b/tests/web/test_async_routes.py @@ -0,0 +1,409 @@ +""" +Tests for the async API routes. 
+""" + +import tempfile +from datetime import datetime, timedelta +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.data.async_request_db import ApiKeyConfig, AsyncRequest, AsyncRequestDB +from src.web.api.v1.async_api.routes import create_async_router, set_async_service +from src.web.services.async_processing import AsyncSubmitResult +from src.web.dependencies import init_dependencies +from src.web.rate_limiter import RateLimiter, RateLimitStatus +from src.web.schemas.inference import AsyncStatus + +# Valid UUID for testing +TEST_REQUEST_UUID = "550e8400-e29b-41d4-a716-446655440000" +INVALID_UUID = "nonexistent-id" + + +@pytest.fixture +def mock_async_service(): + """Create a mock AsyncProcessingService.""" + service = MagicMock() + + # Mock config + mock_config = MagicMock() + mock_config.max_file_size_mb = 50 + service._async_config = mock_config + + # Default submit result + service.submit_request.return_value = AsyncSubmitResult( + success=True, + request_id="test-request-id", + estimated_wait_seconds=30, + ) + + return service + + +@pytest.fixture +def mock_rate_limiter(mock_db): + """Create a mock RateLimiter.""" + limiter = MagicMock(spec=RateLimiter) + + # Default: allow all requests + limiter.check_submit_limit.return_value = RateLimitStatus( + allowed=True, + remaining_requests=9, + reset_at=datetime.utcnow() + timedelta(seconds=60), + ) + limiter.check_poll_limit.return_value = RateLimitStatus( + allowed=True, + remaining_requests=999, + reset_at=datetime.utcnow(), + ) + limiter.get_rate_limit_headers.return_value = {} + + return limiter + + +@pytest.fixture +def app(mock_db, mock_rate_limiter, mock_async_service): + """Create a test FastAPI app with async routes.""" + app = FastAPI() + + # Initialize dependencies + init_dependencies(mock_db, mock_rate_limiter) + set_async_service(mock_async_service) + + # Add routes + router = 
create_async_router(allowed_extensions=(".pdf", ".png", ".jpg", ".jpeg")) + app.include_router(router, prefix="/api/v1") + + return app + + +@pytest.fixture +def client(app): + """Create a test client.""" + return TestClient(app) + + +class TestAsyncSubmitEndpoint: + """Tests for POST /api/v1/async/submit.""" + + def test_submit_success(self, client, mock_async_service): + """Test successful submission.""" + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: + f.write(b"fake pdf content") + f.seek(0) + + response = client.post( + "/api/v1/async/submit", + files={"file": ("test.pdf", f, "application/pdf")}, + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "accepted" + assert data["request_id"] == "test-request-id" + assert "poll_url" in data + + def test_submit_missing_api_key(self, client): + """Test submission without API key.""" + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: + f.write(b"fake pdf content") + f.seek(0) + + response = client.post( + "/api/v1/async/submit", + files={"file": ("test.pdf", f, "application/pdf")}, + ) + + assert response.status_code == 401 + assert "X-API-Key" in response.json()["detail"] + + def test_submit_invalid_api_key(self, client, mock_db): + """Test submission with invalid API key.""" + mock_db.is_valid_api_key.return_value = False + + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: + f.write(b"fake pdf content") + f.seek(0) + + response = client.post( + "/api/v1/async/submit", + files={"file": ("test.pdf", f, "application/pdf")}, + headers={"X-API-Key": "invalid-key"}, + ) + + assert response.status_code == 401 + + def test_submit_unsupported_file_type(self, client): + """Test submission with unsupported file type.""" + with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f: + f.write(b"text content") + f.seek(0) + + response = client.post( + "/api/v1/async/submit", + 
files={"file": ("test.txt", f, "text/plain")}, + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 400 + assert "Unsupported file type" in response.json()["detail"] + + def test_submit_rate_limited(self, client, mock_rate_limiter): + """Test submission when rate limited.""" + mock_rate_limiter.check_submit_limit.return_value = RateLimitStatus( + allowed=False, + remaining_requests=0, + reset_at=datetime.utcnow() + timedelta(seconds=30), + retry_after_seconds=30, + reason="Rate limit exceeded", + ) + mock_rate_limiter.get_rate_limit_headers.return_value = {"Retry-After": "30"} + + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: + f.write(b"fake pdf content") + f.seek(0) + + response = client.post( + "/api/v1/async/submit", + files={"file": ("test.pdf", f, "application/pdf")}, + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 429 + assert "Retry-After" in response.headers + + def test_submit_queue_full(self, client, mock_async_service): + """Test submission when queue is full.""" + mock_async_service.submit_request.return_value = AsyncSubmitResult( + success=False, + request_id="test-id", + error="Processing queue is full", + ) + + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: + f.write(b"fake pdf content") + f.seek(0) + + response = client.post( + "/api/v1/async/submit", + files={"file": ("test.pdf", f, "application/pdf")}, + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 503 + + +class TestAsyncStatusEndpoint: + """Tests for GET /api/v1/async/status/{request_id}.""" + + def test_get_status_pending(self, client, mock_db, sample_async_request): + """Test getting status of pending request.""" + mock_db.get_request_by_api_key.return_value = sample_async_request + mock_db.get_queue_position.return_value = 3 + + response = client.get( + "/api/v1/async/status/550e8400-e29b-41d4-a716-446655440000", + headers={"X-API-Key": "test-api-key"}, + ) + 
+ assert response.status_code == 200 + data = response.json() + assert data["status"] == "pending" + assert data["position_in_queue"] == 3 + assert data["result_url"] is None + + def test_get_status_completed(self, client, mock_db, sample_async_request): + """Test getting status of completed request.""" + sample_async_request.status = "completed" + sample_async_request.completed_at = datetime.utcnow() + mock_db.get_request_by_api_key.return_value = sample_async_request + + response = client.get( + "/api/v1/async/status/550e8400-e29b-41d4-a716-446655440000", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "completed" + assert data["result_url"] is not None + + def test_get_status_not_found(self, client, mock_db): + """Test getting status of non-existent request.""" + mock_db.get_request_by_api_key.return_value = None + + response = client.get( + "/api/v1/async/status/00000000-0000-0000-0000-000000000000", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 404 + + def test_get_status_wrong_api_key(self, client, mock_db, sample_async_request): + """Test that requests are isolated by API key.""" + # Request belongs to different API key + mock_db.get_request_by_api_key.return_value = None + + response = client.get( + "/api/v1/async/status/550e8400-e29b-41d4-a716-446655440000", + headers={"X-API-Key": "different-api-key"}, + ) + + assert response.status_code == 404 + + +class TestAsyncResultEndpoint: + """Tests for GET /api/v1/async/result/{request_id}.""" + + def test_get_result_completed(self, client, mock_db, sample_async_request): + """Test getting result of completed request.""" + sample_async_request.status = "completed" + sample_async_request.completed_at = datetime.utcnow() + sample_async_request.processing_time_ms = 1234.5 + sample_async_request.result = { + "document_id": "test-doc", + "success": True, + "document_type": "invoice", + "fields": 
{"InvoiceNumber": "12345"}, + "confidence": {"InvoiceNumber": 0.95}, + "detections": [], + "errors": [], + } + mock_db.get_request_by_api_key.return_value = sample_async_request + + response = client.get( + "/api/v1/async/result/550e8400-e29b-41d4-a716-446655440000", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "completed" + assert data["result"] is not None + assert data["result"]["fields"]["InvoiceNumber"] == "12345" + + def test_get_result_not_completed(self, client, mock_db, sample_async_request): + """Test getting result of pending request.""" + mock_db.get_request_by_api_key.return_value = sample_async_request + + response = client.get( + "/api/v1/async/result/550e8400-e29b-41d4-a716-446655440000", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 409 + assert "not yet completed" in response.json()["detail"] + + def test_get_result_failed(self, client, mock_db, sample_async_request): + """Test getting result of failed request.""" + sample_async_request.status = "failed" + sample_async_request.error_message = "Processing failed" + sample_async_request.processing_time_ms = 500.0 + mock_db.get_request_by_api_key.return_value = sample_async_request + + response = client.get( + "/api/v1/async/result/550e8400-e29b-41d4-a716-446655440000", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "failed" + + +class TestAsyncListEndpoint: + """Tests for GET /api/v1/async/requests.""" + + def test_list_requests(self, client, mock_db, sample_async_request): + """Test listing requests.""" + mock_db.get_requests_by_api_key.return_value = ([sample_async_request], 1) + + response = client.get( + "/api/v1/async/requests", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 1 + assert 
len(data["requests"]) == 1 + + def test_list_requests_with_status_filter(self, client, mock_db): + """Test listing requests with status filter.""" + mock_db.get_requests_by_api_key.return_value = ([], 0) + + response = client.get( + "/api/v1/async/requests?status=completed", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 200 + mock_db.get_requests_by_api_key.assert_called_once() + call_kwargs = mock_db.get_requests_by_api_key.call_args[1] + assert call_kwargs["status"] == "completed" + + def test_list_requests_pagination(self, client, mock_db): + """Test listing requests with pagination.""" + mock_db.get_requests_by_api_key.return_value = ([], 0) + + response = client.get( + "/api/v1/async/requests?limit=50&offset=10", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 200 + call_kwargs = mock_db.get_requests_by_api_key.call_args[1] + assert call_kwargs["limit"] == 50 + assert call_kwargs["offset"] == 10 + + def test_list_requests_invalid_status(self, client, mock_db): + """Test listing with invalid status filter.""" + response = client.get( + "/api/v1/async/requests?status=invalid", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 400 + + +class TestAsyncDeleteEndpoint: + """Tests for DELETE /api/v1/async/requests/{request_id}.""" + + def test_delete_pending_request(self, client, mock_db, sample_async_request): + """Test deleting a pending request.""" + mock_db.get_request_by_api_key.return_value = sample_async_request + + response = client.delete( + "/api/v1/async/requests/550e8400-e29b-41d4-a716-446655440000", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 200 + assert response.json()["status"] == "deleted" + + def test_delete_processing_request(self, client, mock_db, sample_async_request): + """Test that processing requests cannot be deleted.""" + sample_async_request.status = "processing" + mock_db.get_request_by_api_key.return_value = 
sample_async_request + + response = client.delete( + "/api/v1/async/requests/550e8400-e29b-41d4-a716-446655440000", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 409 + + def test_delete_not_found(self, client, mock_db): + """Test deleting non-existent request.""" + mock_db.get_request_by_api_key.return_value = None + + response = client.delete( + "/api/v1/async/requests/00000000-0000-0000-0000-000000000000", + headers={"X-API-Key": "test-api-key"}, + ) + + assert response.status_code == 404 diff --git a/tests/web/test_async_service.py b/tests/web/test_async_service.py new file mode 100644 index 0000000..ec8071c --- /dev/null +++ b/tests/web/test_async_service.py @@ -0,0 +1,266 @@ +""" +Tests for the AsyncProcessingService class. +""" + +import tempfile +import time +from datetime import datetime, timedelta +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from src.data.async_request_db import AsyncRequest +from src.web.workers.async_queue import AsyncTask, AsyncTaskQueue +from src.web.services.async_processing import AsyncProcessingService, AsyncSubmitResult +from src.web.config import AsyncConfig, StorageConfig +from src.web.rate_limiter import RateLimiter + + +@pytest.fixture +def async_service(mock_db, mock_inference_service, rate_limiter, storage_config): + """Create an AsyncProcessingService for testing.""" + with tempfile.TemporaryDirectory() as tmpdir: + async_config = AsyncConfig( + queue_max_size=10, + worker_count=1, + task_timeout_seconds=30, + result_retention_days=7, + temp_upload_dir=Path(tmpdir) / "async", + max_file_size_mb=10, + ) + + queue = AsyncTaskQueue(max_size=10, worker_count=1) + + service = AsyncProcessingService( + inference_service=mock_inference_service, + db=mock_db, + queue=queue, + rate_limiter=rate_limiter, + async_config=async_config, + storage_config=storage_config, + ) + + yield service + + # Cleanup + if service._queue._started: + service.stop() + + +class 
TestAsyncProcessingService: + """Tests for AsyncProcessingService.""" + + def test_submit_request_success(self, async_service, mock_db): + """Test successful request submission.""" + mock_db.create_request.return_value = "test-request-id" + + result = async_service.submit_request( + api_key="test-api-key", + file_content=b"fake pdf content", + filename="test.pdf", + content_type="application/pdf", + ) + + assert result.success is True + assert result.request_id is not None + assert result.estimated_wait_seconds >= 0 + assert result.error is None + + def test_submit_request_creates_db_record(self, async_service, mock_db): + """Test that submission creates database record.""" + async_service.submit_request( + api_key="test-api-key", + file_content=b"fake pdf content", + filename="test.pdf", + content_type="application/pdf", + ) + + mock_db.create_request.assert_called_once() + call_kwargs = mock_db.create_request.call_args[1] + assert call_kwargs["api_key"] == "test-api-key" + assert call_kwargs["filename"] == "test.pdf" + assert call_kwargs["content_type"] == "application/pdf" + + def test_submit_request_saves_file(self, async_service, mock_db): + """Test that submission saves file to temp directory.""" + content = b"fake pdf content" + + result = async_service.submit_request( + api_key="test-api-key", + file_content=content, + filename="test.pdf", + content_type="application/pdf", + ) + + # File should exist in temp directory + temp_dir = async_service._async_config.temp_upload_dir + files = list(temp_dir.iterdir()) + + # Note: file may be cleaned up quickly if queue processes it + # So we just check that the operation succeeded + assert result.success is True + + def test_submit_request_records_rate_limit(self, async_service, mock_db, rate_limiter): + """Test that submission records rate limit event.""" + async_service.submit_request( + api_key="test-api-key", + file_content=b"fake pdf content", + filename="test.pdf", + content_type="application/pdf", + ) + + # 
Rate limiter should have recorded the request + mock_db.record_rate_limit_event.assert_called() + + def test_start_and_stop(self, async_service): + """Test starting and stopping the service.""" + async_service.start() + + assert async_service._queue._started is True + assert async_service._cleanup_thread is not None + assert async_service._cleanup_thread.is_alive() + + async_service.stop() + + assert async_service._queue._started is False + + def test_process_task_success(self, async_service, mock_db, mock_inference_service, sample_task): + """Test successful task processing.""" + async_service._process_task(sample_task) + + # Should update status to processing + mock_db.update_status.assert_called_with(sample_task.request_id, "processing") + + # Should complete the request + mock_db.complete_request.assert_called_once() + call_kwargs = mock_db.complete_request.call_args[1] + assert call_kwargs["request_id"] == sample_task.request_id + assert "document_id" in call_kwargs + + def test_process_task_pdf(self, async_service, mock_db, mock_inference_service, sample_task): + """Test processing a PDF task.""" + async_service._process_task(sample_task) + + # Should call process_pdf for .pdf files + mock_inference_service.process_pdf.assert_called_once() + + def test_process_task_image(self, async_service, mock_db, mock_inference_service): + """Test processing an image task.""" + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + f.write(b"fake image content") + task = AsyncTask( + request_id="image-task", + api_key="test-api-key", + file_path=Path(f.name), + filename="test.png", + ) + + async_service._process_task(task) + + # Should call process_image for image files + mock_inference_service.process_image.assert_called_once() + + def test_process_task_failure(self, async_service, mock_db, mock_inference_service, sample_task): + """Test task processing failure.""" + mock_inference_service.process_pdf.side_effect = Exception("Processing failed") + + 
async_service._process_task(sample_task) + + # Should update status to failed + mock_db.update_status.assert_called() + last_call = mock_db.update_status.call_args_list[-1] + assert last_call[0][1] == "failed" # status + assert "Processing failed" in last_call[1]["error_message"] + + def test_process_task_file_not_found(self, async_service, mock_db): + """Test task processing with missing file.""" + task = AsyncTask( + request_id="missing-file-task", + api_key="test-api-key", + file_path=Path("/nonexistent/file.pdf"), + filename="test.pdf", + ) + + async_service._process_task(task) + + # Should fail with file not found + mock_db.update_status.assert_called() + last_call = mock_db.update_status.call_args_list[-1] + assert last_call[0][1] == "failed" + + def test_process_task_cleans_up_file(self, async_service, mock_db, mock_inference_service): + """Test that task processing cleans up the uploaded file.""" + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: + f.write(b"fake pdf content") + file_path = Path(f.name) + + task = AsyncTask( + request_id="cleanup-task", + api_key="test-api-key", + file_path=file_path, + filename="test.pdf", + ) + + async_service._process_task(task) + + # File should be deleted + assert not file_path.exists() + + def test_estimate_wait(self, async_service): + """Test wait time estimation.""" + # Empty queue + wait = async_service._estimate_wait() + assert wait == 0 + + def test_cleanup_orphan_files(self, async_service, mock_db): + """Test cleanup of orphan files.""" + # Create an orphan file + temp_dir = async_service._async_config.temp_upload_dir + orphan_file = temp_dir / "orphan-request.pdf" + orphan_file.write_bytes(b"orphan content") + + # Set file mtime to old + import os + old_time = time.time() - 7200 + os.utime(orphan_file, (old_time, old_time)) + + # Mock database to say file doesn't exist + mock_db.get_request.return_value = None + + count = async_service._cleanup_orphan_files() + + assert count == 1 + assert 
not orphan_file.exists() + + def test_save_upload(self, async_service): + """Test saving uploaded file.""" + content = b"test content" + + file_path = async_service._save_upload( + request_id="test-save", + filename="test.pdf", + content=content, + ) + + assert file_path.exists() + assert file_path.read_bytes() == content + assert file_path.suffix == ".pdf" + + # Cleanup + file_path.unlink() + + def test_save_upload_preserves_extension(self, async_service): + """Test that save_upload preserves file extension.""" + content = b"test content" + + # Test various extensions + for ext in [".pdf", ".png", ".jpg", ".jpeg"]: + file_path = async_service._save_upload( + request_id=f"test-{ext}", + filename=f"test{ext}", + content=content, + ) + + assert file_path.suffix == ext + file_path.unlink() diff --git a/tests/web/test_autolabel_with_locks.py b/tests/web/test_autolabel_with_locks.py new file mode 100644 index 0000000..dfc400a --- /dev/null +++ b/tests/web/test_autolabel_with_locks.py @@ -0,0 +1,250 @@ +""" +Tests for Auto-Label Service with Annotation Lock Integration (Phase 3.5). 
+""" + +import pytest +from datetime import datetime, timedelta, timezone +from pathlib import Path +from unittest.mock import Mock, MagicMock +from uuid import uuid4 + +from src.web.services.autolabel import AutoLabelService +from src.data.admin_db import AdminDB + + +class MockDocument: + """Mock document for testing.""" + + def __init__(self, document_id, annotation_lock_until=None): + self.document_id = document_id + self.annotation_lock_until = annotation_lock_until + self.status = "pending" + self.auto_label_status = None + self.auto_label_error = None + + +class MockAdminDB: + """Mock AdminDB for testing.""" + + def __init__(self): + self.documents = {} + self.annotations = [] + self.status_updates = [] + + def get_document(self, document_id): + """Get document by ID.""" + return self.documents.get(str(document_id)) + + def update_document_status( + self, + document_id, + status=None, + auto_label_status=None, + auto_label_error=None, + ): + """Mock status update.""" + self.status_updates.append({ + "document_id": document_id, + "status": status, + "auto_label_status": auto_label_status, + "auto_label_error": auto_label_error, + }) + doc = self.documents.get(str(document_id)) + if doc: + if status: + doc.status = status + if auto_label_status: + doc.auto_label_status = auto_label_status + if auto_label_error: + doc.auto_label_error = auto_label_error + + def delete_annotations_for_document(self, document_id, source=None): + """Mock delete annotations.""" + return 0 + + def create_annotations_batch(self, annotations): + """Mock create annotations.""" + self.annotations.extend(annotations) + + +@pytest.fixture +def mock_db(): + """Create mock admin DB.""" + return MockAdminDB() + + +@pytest.fixture +def auto_label_service(monkeypatch): + """Create auto-label service with mocked image processing.""" + service = AutoLabelService() + # Mock the OCR engine to avoid dependencies + service._ocr_engine = Mock() + service._ocr_engine.extract_from_image = 
Mock(return_value=[]) + + # Mock the image processing methods to avoid file I/O errors + def mock_process_image(self, document_id, image_path, field_values, db, page_number=1): + return 0 # No annotations created (mocked) + + monkeypatch.setattr(AutoLabelService, "_process_image", mock_process_image) + + return service + + +class TestAutoLabelWithLocks: + """Tests for auto-label service with lock integration.""" + + def test_auto_label_unlocked_document_succeeds(self, auto_label_service, mock_db, tmp_path): + """Test auto-labeling succeeds on unlocked document.""" + # Create test document (unlocked) + document_id = str(uuid4()) + mock_db.documents[document_id] = MockDocument( + document_id=document_id, + annotation_lock_until=None, + ) + + # Create dummy file + test_file = tmp_path / "test.png" + test_file.write_text("dummy") + + # Attempt auto-label + result = auto_label_service.auto_label_document( + document_id=document_id, + file_path=str(test_file), + field_values={"invoice_number": "INV-001"}, + db=mock_db, + ) + + # Should succeed + assert result["status"] == "completed" + # Verify status was updated to running and then completed + assert len(mock_db.status_updates) >= 2 + assert mock_db.status_updates[0]["auto_label_status"] == "running" + + def test_auto_label_locked_document_fails(self, auto_label_service, mock_db, tmp_path): + """Test auto-labeling fails on locked document.""" + # Create test document (locked for 1 hour) + document_id = str(uuid4()) + lock_until = datetime.now(timezone.utc) + timedelta(hours=1) + mock_db.documents[document_id] = MockDocument( + document_id=document_id, + annotation_lock_until=lock_until, + ) + + # Create dummy file + test_file = tmp_path / "test.png" + test_file.write_text("dummy") + + # Attempt auto-label (should fail) + result = auto_label_service.auto_label_document( + document_id=document_id, + file_path=str(test_file), + field_values={"invoice_number": "INV-001"}, + db=mock_db, + ) + + # Should fail + assert 
result["status"] == "failed" + assert "locked for annotation" in result["error"] + assert result["annotations_created"] == 0 + + # Verify status was updated to failed + assert any( + update["auto_label_status"] == "failed" + for update in mock_db.status_updates + ) + + def test_auto_label_expired_lock_succeeds(self, auto_label_service, mock_db, tmp_path): + """Test auto-labeling succeeds when lock has expired.""" + # Create test document (lock expired 1 hour ago) + document_id = str(uuid4()) + lock_until = datetime.now(timezone.utc) - timedelta(hours=1) + mock_db.documents[document_id] = MockDocument( + document_id=document_id, + annotation_lock_until=lock_until, + ) + + # Create dummy file + test_file = tmp_path / "test.png" + test_file.write_text("dummy") + + # Attempt auto-label + result = auto_label_service.auto_label_document( + document_id=document_id, + file_path=str(test_file), + field_values={"invoice_number": "INV-001"}, + db=mock_db, + ) + + # Should succeed (lock expired) + assert result["status"] == "completed" + + def test_auto_label_skip_lock_check(self, auto_label_service, mock_db, tmp_path): + """Test auto-labeling with skip_lock_check=True bypasses lock.""" + # Create test document (locked) + document_id = str(uuid4()) + lock_until = datetime.now(timezone.utc) + timedelta(hours=1) + mock_db.documents[document_id] = MockDocument( + document_id=document_id, + annotation_lock_until=lock_until, + ) + + # Create dummy file + test_file = tmp_path / "test.png" + test_file.write_text("dummy") + + # Attempt auto-label with skip_lock_check=True + result = auto_label_service.auto_label_document( + document_id=document_id, + file_path=str(test_file), + field_values={"invoice_number": "INV-001"}, + db=mock_db, + skip_lock_check=True, # Bypass lock check + ) + + # Should succeed even though document is locked + assert result["status"] == "completed" + + def test_auto_label_document_not_found(self, auto_label_service, mock_db, tmp_path): + """Test auto-labeling 
fails when document doesn't exist.""" + # Create dummy file + test_file = tmp_path / "test.png" + test_file.write_text("dummy") + + # Attempt auto-label on non-existent document + result = auto_label_service.auto_label_document( + document_id=str(uuid4()), + file_path=str(test_file), + field_values={"invoice_number": "INV-001"}, + db=mock_db, + ) + + # Should fail + assert result["status"] == "failed" + assert "not found" in result["error"] + + def test_auto_label_respects_lock_by_default(self, auto_label_service, mock_db, tmp_path): + """Test that lock check is enabled by default.""" + # Create test document (locked) + document_id = str(uuid4()) + lock_until = datetime.now(timezone.utc) + timedelta(minutes=30) + mock_db.documents[document_id] = MockDocument( + document_id=document_id, + annotation_lock_until=lock_until, + ) + + # Create dummy file + test_file = tmp_path / "test.png" + test_file.write_text("dummy") + + # Call without explicit skip_lock_check (defaults to False) + result = auto_label_service.auto_label_document( + document_id=document_id, + file_path=str(test_file), + field_values={"invoice_number": "INV-001"}, + db=mock_db, + # skip_lock_check not specified, should default to False + ) + + # Should fail due to lock + assert result["status"] == "failed" + assert "locked" in result["error"].lower() diff --git a/tests/web/test_batch_queue.py b/tests/web/test_batch_queue.py new file mode 100644 index 0000000..e619313 --- /dev/null +++ b/tests/web/test_batch_queue.py @@ -0,0 +1,282 @@ +""" +Tests for Batch Upload Queue +""" + +import time +from datetime import datetime +from threading import Event +from uuid import uuid4 + +import pytest + +from src.web.workers.batch_queue import BatchTask, BatchTaskQueue + + +class MockBatchService: + """Mock batch upload service for testing.""" + + def __init__(self): + self.processed_tasks = [] + self.process_delay = 0.1 # Simulate processing time + self.should_fail = False + + def process_zip_upload(self, 
admin_token, zip_filename, zip_content, upload_source): + """Mock process_zip_upload method.""" + if self.should_fail: + raise Exception("Simulated processing error") + + time.sleep(self.process_delay) # Simulate work + + self.processed_tasks.append({ + "admin_token": admin_token, + "zip_filename": zip_filename, + "upload_source": upload_source, + }) + + return { + "status": "completed", + "successful_files": 1, + "failed_files": 0, + } + + +class TestBatchTask: + """Tests for BatchTask dataclass.""" + + def test_batch_task_creation(self): + """BatchTask can be created with required fields.""" + task = BatchTask( + batch_id=uuid4(), + admin_token="test-token", + zip_content=b"test", + zip_filename="test.zip", + upload_source="ui", + auto_label=True, + created_at=datetime.utcnow(), + ) + + assert task.batch_id is not None + assert task.admin_token == "test-token" + assert task.zip_filename == "test.zip" + assert task.upload_source == "ui" + assert task.auto_label is True + + +class TestBatchTaskQueue: + """Tests for batch task queue functionality.""" + + def test_queue_initialization(self): + """Queue initializes with correct defaults.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + + assert queue.get_queue_depth() == 0 + assert queue.is_running is False + assert queue._worker_count == 1 + + def test_start_queue(self): + """Queue starts with batch service.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + service = MockBatchService() + + queue.start(service) + + assert queue.is_running is True + assert len(queue._workers) == 1 + + queue.stop() + + def test_stop_queue(self): + """Queue stops gracefully.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + service = MockBatchService() + + queue.start(service) + assert queue.is_running is True + + queue.stop(timeout=5.0) + + assert queue.is_running is False + assert len(queue._workers) == 0 + + def test_submit_task_success(self): + """Task is submitted to queue successfully.""" + queue = 
BatchTaskQueue(max_size=10, worker_count=1) + + task = BatchTask( + batch_id=uuid4(), + admin_token="test-token", + zip_content=b"test", + zip_filename="test.zip", + upload_source="ui", + auto_label=True, + created_at=datetime.utcnow(), + ) + + result = queue.submit(task) + + assert result is True + assert queue.get_queue_depth() == 1 + + def test_submit_task_queue_full(self): + """Returns False when queue is full.""" + queue = BatchTaskQueue(max_size=2, worker_count=1) + + # Fill the queue + for i in range(2): + task = BatchTask( + batch_id=uuid4(), + admin_token="test-token", + zip_content=b"test", + zip_filename=f"test{i}.zip", + upload_source="ui", + auto_label=True, + created_at=datetime.utcnow(), + ) + assert queue.submit(task) is True + + # Try to add one more (should fail) + extra_task = BatchTask( + batch_id=uuid4(), + admin_token="test-token", + zip_content=b"test", + zip_filename="extra.zip", + upload_source="ui", + auto_label=True, + created_at=datetime.utcnow(), + ) + + result = queue.submit(extra_task) + + assert result is False + assert queue.get_queue_depth() == 2 + + def test_worker_processes_task(self): + """Worker thread processes queued tasks.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + service = MockBatchService() + + queue.start(service) + + task = BatchTask( + batch_id=uuid4(), + admin_token="test-token", + zip_content=b"test", + zip_filename="test.zip", + upload_source="ui", + auto_label=True, + created_at=datetime.utcnow(), + ) + + queue.submit(task) + + # Wait for processing + time.sleep(0.5) + + assert len(service.processed_tasks) == 1 + assert service.processed_tasks[0]["zip_filename"] == "test.zip" + + queue.stop() + + def test_multiple_tasks_processed(self): + """Multiple tasks are processed in order.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + service = MockBatchService() + + queue.start(service) + + # Submit multiple tasks + for i in range(3): + task = BatchTask( + batch_id=uuid4(), + 
admin_token="test-token", + zip_content=b"test", + zip_filename=f"test{i}.zip", + upload_source="ui", + auto_label=True, + created_at=datetime.utcnow(), + ) + queue.submit(task) + + # Wait for all to process + time.sleep(1.0) + + assert len(service.processed_tasks) == 3 + + queue.stop() + + def test_get_queue_depth(self): + """Returns correct queue depth.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + + assert queue.get_queue_depth() == 0 + + # Add tasks + for i in range(3): + task = BatchTask( + batch_id=uuid4(), + admin_token="test-token", + zip_content=b"test", + zip_filename=f"test{i}.zip", + upload_source="ui", + auto_label=True, + created_at=datetime.utcnow(), + ) + queue.submit(task) + + assert queue.get_queue_depth() == 3 + + def test_is_running_property(self): + """is_running reflects queue state.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + service = MockBatchService() + + assert queue.is_running is False + + queue.start(service) + assert queue.is_running is True + + queue.stop() + assert queue.is_running is False + + def test_double_start_ignored(self): + """Starting queue twice is safely ignored.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + service = MockBatchService() + + queue.start(service) + worker_count_after_first_start = len(queue._workers) + + queue.start(service) # Second start + worker_count_after_second_start = len(queue._workers) + + assert worker_count_after_first_start == worker_count_after_second_start + + queue.stop() + + def test_error_handling_in_worker(self): + """Worker handles processing errors gracefully.""" + queue = BatchTaskQueue(max_size=10, worker_count=1) + service = MockBatchService() + service.should_fail = True # Cause errors + + queue.start(service) + + task = BatchTask( + batch_id=uuid4(), + admin_token="test-token", + zip_content=b"test", + zip_filename="test.zip", + upload_source="ui", + auto_label=True, + created_at=datetime.utcnow(), + ) + + queue.submit(task) + + # Wait for 
processing attempt + time.sleep(0.5) + + # Worker should still be running + assert queue.is_running is True + + queue.stop() diff --git a/tests/web/test_batch_upload_routes.py b/tests/web/test_batch_upload_routes.py new file mode 100644 index 0000000..b039688 --- /dev/null +++ b/tests/web/test_batch_upload_routes.py @@ -0,0 +1,368 @@ +""" +Tests for Batch Upload Routes +""" + +import io +import zipfile +from datetime import datetime +from uuid import uuid4 + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.web.api.v1.batch.routes import router +from src.web.core.auth import validate_admin_token, get_admin_db +from src.web.workers.batch_queue import init_batch_queue, shutdown_batch_queue +from src.web.services.batch_upload import BatchUploadService + + +class MockAdminDB: + """Mock AdminDB for testing.""" + + def __init__(self): + self.batches = {} + self.batch_files = {} + + def create_batch_upload(self, admin_token, filename, file_size, upload_source): + batch_id = uuid4() + batch = type('BatchUpload', (), { + 'batch_id': batch_id, + 'admin_token': admin_token, + 'filename': filename, + 'file_size': file_size, + 'upload_source': upload_source, + 'status': 'processing', + 'total_files': 0, + 'processed_files': 0, + 'successful_files': 0, + 'failed_files': 0, + 'csv_filename': None, + 'csv_row_count': None, + 'error_message': None, + 'created_at': datetime.utcnow(), + 'completed_at': None, + })() + self.batches[batch_id] = batch + return batch + + def update_batch_upload(self, batch_id, **kwargs): + if batch_id in self.batches: + batch = self.batches[batch_id] + for key, value in kwargs.items(): + setattr(batch, key, value) + + def create_batch_upload_file(self, batch_id, filename, **kwargs): + file_id = uuid4() + defaults = { + 'file_id': file_id, + 'batch_id': batch_id, + 'filename': filename, + 'status': 'pending', + 'error_message': None, + 'annotation_count': 0, + 'csv_row_data': None, + } + 
defaults.update(kwargs) + file_record = type('BatchUploadFile', (), defaults)() + if batch_id not in self.batch_files: + self.batch_files[batch_id] = [] + self.batch_files[batch_id].append(file_record) + return file_record + + def update_batch_upload_file(self, file_id, **kwargs): + for files in self.batch_files.values(): + for file_record in files: + if file_record.file_id == file_id: + for key, value in kwargs.items(): + setattr(file_record, key, value) + return + + def get_batch_upload(self, batch_id): + return self.batches.get(batch_id, type('BatchUpload', (), { + 'batch_id': batch_id, + 'admin_token': 'test-token', + 'filename': 'test.zip', + 'status': 'completed', + 'total_files': 2, + 'processed_files': 2, + 'successful_files': 2, + 'failed_files': 0, + 'csv_filename': None, + 'csv_row_count': None, + 'error_message': None, + 'created_at': datetime.utcnow(), + 'completed_at': datetime.utcnow(), + })()) + + def get_batch_upload_files(self, batch_id): + return self.batch_files.get(batch_id, []) + + def get_batch_uploads_by_token(self, admin_token, limit=50, offset=0): + """Get batches filtered by admin token with pagination.""" + token_batches = [b for b in self.batches.values() if b.admin_token == admin_token] + total = len(token_batches) + return token_batches[offset:offset+limit], total + + +@pytest.fixture(scope="class") +def app(): + """Create test FastAPI app with mocked dependencies.""" + app = FastAPI() + + # Create mock admin DB + mock_admin_db = MockAdminDB() + + # Override dependencies + app.dependency_overrides[validate_admin_token] = lambda: "test-token" + app.dependency_overrides[get_admin_db] = lambda: mock_admin_db + + # Initialize batch queue with mock service + batch_service = BatchUploadService(mock_admin_db) + init_batch_queue(batch_service) + + app.include_router(router) + + yield app + + # Cleanup: shutdown batch queue after all tests in class + shutdown_batch_queue() + + +@pytest.fixture +def client(app): + """Create test client.""" + 
return TestClient(app) + + +def create_test_zip(files): + """Create a test ZIP file.""" + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: + for filename, content in files.items(): + zip_file.writestr(filename, content) + zip_buffer.seek(0) + return zip_buffer + + +class TestBatchUploadRoutes: + """Tests for batch upload API routes.""" + + def test_upload_batch_success(self, client): + """Test successful batch upload (defaults to async mode).""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + "INV002.pdf": b"%PDF-1.4 test content 2", + } + zip_file = create_test_zip(files) + + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("test.zip", zip_file, "application/zip")}, + data={"upload_source": "ui"}, + ) + + # Async mode is default, should return 202 + assert response.status_code == 202 + result = response.json() + assert "batch_id" in result + assert result["status"] == "accepted" + + def test_upload_batch_non_zip_file(self, client): + """Test uploading non-ZIP file.""" + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("test.pdf", io.BytesIO(b"test"), "application/pdf")}, + data={"upload_source": "ui"}, + ) + + assert response.status_code == 400 + assert "Only ZIP files" in response.json()["detail"] + + def test_upload_batch_with_csv(self, client): + """Test batch upload with CSV (defaults to async).""" + csv_content = """DocumentId,InvoiceNumber,Amount +INV001,F2024-001,1500.00 +INV002,F2024-002,2500.00 +""" + files = { + "INV001.pdf": b"%PDF-1.4 test", + "INV002.pdf": b"%PDF-1.4 test 2", + "metadata.csv": csv_content.encode('utf-8'), + } + zip_file = create_test_zip(files) + + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("batch.zip", zip_file, "application/zip")}, + data={"upload_source": "api"}, + ) + + # Async mode is default, should return 202 + assert response.status_code == 202 + result = response.json() + assert 
"batch_id" in result + assert result["status"] == "accepted" + + def test_get_batch_status(self, client): + """Test getting batch status.""" + batch_id = str(uuid4()) + response = client.get(f"/api/v1/admin/batch/status/{batch_id}") + + assert response.status_code == 200 + result = response.json() + assert result["batch_id"] == batch_id + assert "status" in result + assert "total_files" in result + + def test_list_batch_uploads(self, client): + """Test listing batch uploads.""" + response = client.get("/api/v1/admin/batch/list") + + assert response.status_code == 200 + result = response.json() + assert "batches" in result + assert "total" in result + assert "limit" in result + assert "offset" in result + + def test_upload_batch_async_mode_default(self, client): + """Test async mode is default (async_mode=True).""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + } + zip_file = create_test_zip(files) + + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("test.zip", zip_file, "application/zip")}, + data={"upload_source": "ui"}, + ) + + # Async mode should return 202 Accepted + assert response.status_code == 202 + result = response.json() + assert result["status"] == "accepted" + assert "batch_id" in result + assert "status_url" in result + assert "queue_depth" in result + assert result["message"] == "Batch upload queued for processing" + + def test_upload_batch_async_mode_explicit(self, client): + """Test explicit async mode (async_mode=True).""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + } + zip_file = create_test_zip(files) + + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("test.zip", zip_file, "application/zip")}, + data={"upload_source": "ui", "async_mode": "true"}, + ) + + assert response.status_code == 202 + result = response.json() + assert result["status"] == "accepted" + assert "batch_id" in result + assert "status_url" in result + + def test_upload_batch_sync_mode(self, client): + 
"""Test sync mode (async_mode=False).""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + } + zip_file = create_test_zip(files) + + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("test.zip", zip_file, "application/zip")}, + data={"upload_source": "ui", "async_mode": "false"}, + ) + + # Sync mode should return 200 OK with full results + assert response.status_code == 200 + result = response.json() + assert "batch_id" in result + assert result["status"] in ["completed", "partial", "failed"] + assert "successful_files" in result + + def test_upload_batch_async_with_auto_label(self, client): + """Test async mode with auto_label flag.""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + } + zip_file = create_test_zip(files) + + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("test.zip", zip_file, "application/zip")}, + data={ + "upload_source": "ui", + "async_mode": "true", + "auto_label": "true", + }, + ) + + assert response.status_code == 202 + result = response.json() + assert result["status"] == "accepted" + assert "batch_id" in result + + def test_upload_batch_async_without_auto_label(self, client): + """Test async mode with auto_label disabled.""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + } + zip_file = create_test_zip(files) + + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("test.zip", zip_file, "application/zip")}, + data={ + "upload_source": "ui", + "async_mode": "true", + "auto_label": "false", + }, + ) + + assert response.status_code == 202 + result = response.json() + assert result["status"] == "accepted" + + def test_upload_batch_queue_full(self, client): + """Test handling queue full scenario.""" + # This test would require mocking the queue to return False on submit + # For now, we verify the endpoint accepts the request + files = { + "INV001.pdf": b"%PDF-1.4 test content", + } + zip_file = create_test_zip(files) + + response = client.post( + 
"/api/v1/admin/batch/upload", + files={"file": ("test.zip", zip_file, "application/zip")}, + data={"upload_source": "ui", "async_mode": "true"}, + ) + + # Should either accept (202) or reject if queue full (503) + assert response.status_code in [202, 503] + + def test_async_status_url_format(self, client): + """Test async response contains correctly formatted status URL.""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + } + zip_file = create_test_zip(files) + + response = client.post( + "/api/v1/admin/batch/upload", + files={"file": ("test.zip", zip_file, "application/zip")}, + data={"async_mode": "true"}, + ) + + assert response.status_code == 202 + result = response.json() + batch_id = result["batch_id"] + expected_url = f"/api/v1/admin/batch/status/{batch_id}" + assert result["status_url"] == expected_url diff --git a/tests/web/test_batch_upload_service.py b/tests/web/test_batch_upload_service.py new file mode 100644 index 0000000..102cf3a --- /dev/null +++ b/tests/web/test_batch_upload_service.py @@ -0,0 +1,221 @@ +""" +Tests for Batch Upload Service +""" + +import io +import zipfile +from pathlib import Path +from uuid import uuid4 + +import pytest + +from src.data.admin_db import AdminDB +from src.web.services.batch_upload import BatchUploadService + + +@pytest.fixture +def admin_db(): + """Mock admin database for testing.""" + class MockAdminDB: + def __init__(self): + self.batches = {} + self.batch_files = {} + + def create_batch_upload(self, admin_token, filename, file_size, upload_source): + batch_id = uuid4() + batch = type('BatchUpload', (), { + 'batch_id': batch_id, + 'admin_token': admin_token, + 'filename': filename, + 'file_size': file_size, + 'upload_source': upload_source, + 'status': 'processing', + 'total_files': 0, + 'processed_files': 0, + 'successful_files': 0, + 'failed_files': 0, + 'csv_filename': None, + 'csv_row_count': None, + 'error_message': None, + 'created_at': None, + 'completed_at': None, + })() + self.batches[batch_id] = 
batch + return batch + + def update_batch_upload(self, batch_id, **kwargs): + if batch_id in self.batches: + batch = self.batches[batch_id] + for key, value in kwargs.items(): + setattr(batch, key, value) + + def create_batch_upload_file(self, batch_id, filename, **kwargs): + file_id = uuid4() + # Set defaults for attributes + defaults = { + 'file_id': file_id, + 'batch_id': batch_id, + 'filename': filename, + 'status': 'pending', + 'error_message': None, + 'annotation_count': 0, + 'csv_row_data': None, + } + defaults.update(kwargs) + file_record = type('BatchUploadFile', (), defaults)() + if batch_id not in self.batch_files: + self.batch_files[batch_id] = [] + self.batch_files[batch_id].append(file_record) + return file_record + + def update_batch_upload_file(self, file_id, **kwargs): + for files in self.batch_files.values(): + for file_record in files: + if file_record.file_id == file_id: + for key, value in kwargs.items(): + setattr(file_record, key, value) + return + + def get_batch_upload(self, batch_id): + return self.batches.get(batch_id) + + def get_batch_upload_files(self, batch_id): + return self.batch_files.get(batch_id, []) + + return MockAdminDB() + + +@pytest.fixture +def batch_service(admin_db): + """Batch upload service instance.""" + return BatchUploadService(admin_db) + + +def create_test_zip(files): + """Create a test ZIP file with given files. 
+ + Args: + files: Dictionary mapping filenames to content bytes + + Returns: + ZIP file content as bytes + """ + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: + for filename, content in files.items(): + zip_file.writestr(filename, content) + return zip_buffer.getvalue() + + +class TestBatchUploadService: + """Tests for BatchUploadService.""" + + def test_process_empty_zip(self, batch_service): + """Test processing an empty ZIP file.""" + zip_content = create_test_zip({}) + result = batch_service.process_zip_upload( + admin_token="test-token", + zip_filename="empty.zip", + zip_content=zip_content, + ) + + assert result["status"] == "failed" + assert "No PDF files" in result.get("error", "") + + def test_process_zip_with_pdfs_only(self, batch_service): + """Test processing ZIP with PDFs but no CSV.""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + "INV002.pdf": b"%PDF-1.4 test content 2", + } + zip_content = create_test_zip(files) + + result = batch_service.process_zip_upload( + admin_token="test-token", + zip_filename="invoices.zip", + zip_content=zip_content, + ) + + assert result["status"] == "completed" + assert result["total_files"] == 2 + assert result["successful_files"] == 2 + assert result["failed_files"] == 0 + + def test_process_zip_with_csv(self, batch_service): + """Test processing ZIP with PDFs and CSV.""" + csv_content = """DocumentId,InvoiceNumber,Amount,OCR +INV001,F2024-001,1500.00,7350012345678 +INV002,F2024-002,2500.00,7350087654321 +""" + files = { + "INV001.pdf": b"%PDF-1.4 test content", + "INV002.pdf": b"%PDF-1.4 test content 2", + "metadata.csv": csv_content.encode('utf-8'), + } + zip_content = create_test_zip(files) + + result = batch_service.process_zip_upload( + admin_token="test-token", + zip_filename="invoices.zip", + zip_content=zip_content, + ) + + assert result["status"] == "completed" + assert result["total_files"] == 2 + assert result["csv_filename"] == "metadata.csv" 
+ assert result["csv_row_count"] == 2 + + def test_process_invalid_zip(self, batch_service): + """Test processing invalid ZIP file.""" + result = batch_service.process_zip_upload( + admin_token="test-token", + zip_filename="invalid.zip", + zip_content=b"not a zip file", + ) + + assert result["status"] == "failed" + assert "Invalid ZIP file" in result.get("error", "") + + def test_csv_parsing(self, batch_service): + """Test CSV field parsing.""" + csv_content = """DocumentId,InvoiceNumber,InvoiceDate,Amount,OCR,Bankgiro,customer_number +INV001,F2024-001,2024-01-15,1500.00,7350012345678,123-4567,C123 +INV002,F2024-002,2024-01-16,2500.00,7350087654321,123-4567,C124 +""" + zip_file_content = create_test_zip({"metadata.csv": csv_content.encode('utf-8')}) + + with zipfile.ZipFile(io.BytesIO(zip_file_content)) as zip_file: + csv_file_info = [f for f in zip_file.filelist if f.filename.endswith('.csv')][0] + csv_data = batch_service._parse_csv_file(zip_file, csv_file_info) + + assert len(csv_data) == 2 + assert "INV001" in csv_data + assert csv_data["INV001"]["InvoiceNumber"] == "F2024-001" + assert csv_data["INV001"]["Amount"] == "1500.00" + assert csv_data["INV001"]["customer_number"] == "C123" + + def test_get_batch_status(self, batch_service, admin_db): + """Test getting batch upload status.""" + # Create a batch + zip_content = create_test_zip({"INV001.pdf": b"%PDF-1.4 test"}) + result = batch_service.process_zip_upload( + admin_token="test-token", + zip_filename="test.zip", + zip_content=zip_content, + ) + + batch_id = result["batch_id"] + + # Get status + status = batch_service.get_batch_status(batch_id) + + assert status["batch_id"] == batch_id + assert status["filename"] == "test.zip" + assert status["status"] == "completed" + assert status["total_files"] == 1 + assert len(status["files"]) == 1 + + def test_get_batch_status_not_found(self, batch_service): + """Test getting status for non-existent batch.""" + status = batch_service.get_batch_status(str(uuid4())) + 
assert "error" in status diff --git a/tests/web/test_inference_api.py b/tests/web/test_inference_api.py new file mode 100644 index 0000000..adc0d35 --- /dev/null +++ b/tests/web/test_inference_api.py @@ -0,0 +1,298 @@ +""" +Integration tests for inference API endpoints. + +Tests the /api/v1/infer endpoint to ensure it works end-to-end. +""" + +import pytest +from pathlib import Path +from unittest.mock import Mock, patch +from fastapi.testclient import TestClient +from PIL import Image +import io + +from src.web.app import create_app +from src.web.config import ModelConfig, StorageConfig, AppConfig + + +@pytest.fixture +def test_app(tmp_path): + """Create test FastAPI application.""" + # Setup test directories + upload_dir = tmp_path / "uploads" + result_dir = tmp_path / "results" + upload_dir.mkdir() + result_dir.mkdir() + + # Create test config + app_config = AppConfig( + model=ModelConfig( + model_path=Path("runs/train/invoice_fields/weights/best.pt"), + confidence_threshold=0.5, + use_gpu=False, + dpi=150, + ), + storage=StorageConfig( + upload_dir=upload_dir, + result_dir=result_dir, + allowed_extensions={".pdf", ".png", ".jpg", ".jpeg"}, + max_file_size_mb=50, + ), + ) + + # Create app + app = create_app(app_config) + return app + + +@pytest.fixture +def client(test_app): + """Create test client.""" + return TestClient(test_app) + + +@pytest.fixture +def sample_png_bytes(): + """Create sample PNG image bytes.""" + img = Image.new('RGB', (800, 1200), color='white') + img_bytes = io.BytesIO() + img.save(img_bytes, format='PNG') + img_bytes.seek(0) + return img_bytes + + +class TestHealthEndpoint: + """Test /api/v1/health endpoint.""" + + def test_health_check_returns_200(self, client): + """Test health check returns 200 OK.""" + response = client.get("/api/v1/health") + assert response.status_code == 200 + + def test_health_check_response_structure(self, client): + """Test health check response has correct structure.""" + response = client.get("/api/v1/health") 
+ data = response.json() + + assert "status" in data + assert "model_loaded" in data + assert "gpu_available" in data + assert "version" in data + + assert data["status"] == "healthy" + assert isinstance(data["model_loaded"], bool) + assert isinstance(data["gpu_available"], bool) + + +class TestInferEndpoint: + """Test /api/v1/infer endpoint.""" + + @patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + def test_infer_accepts_png_file( + self, + mock_yolo_detector, + mock_pipeline, + client, + sample_png_bytes, + ): + """Test that /infer endpoint accepts PNG files.""" + # Setup mocks + mock_detector_instance = Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + # Mock pipeline result + mock_result = Mock() + mock_result.fields = {"InvoiceNumber": "12345"} + mock_result.confidence = {"InvoiceNumber": 0.95} + mock_result.success = True + mock_result.errors = [] + mock_result.raw_detections = [] + mock_result.document_id = "test123" + mock_result.document_type = "invoice" + mock_result.processing_time_ms = 100.0 + mock_result.visualization_path = None + mock_result.detections = [] + mock_pipeline_instance.process_image.return_value = mock_result + + # Make request + response = client.post( + "/api/v1/infer", + files={"file": ("test.png", sample_png_bytes, "image/png")}, + ) + + # Verify response + assert response.status_code == 200 + data = response.json() + + assert data["status"] == "success" + assert "result" in data + assert data["result"]["fields"]["InvoiceNumber"] == "12345" + assert data["result"]["confidence"]["InvoiceNumber"] == 0.95 + + def test_infer_rejects_invalid_file_type(self, client): + """Test that /infer rejects unsupported file types.""" + invalid_file = io.BytesIO(b"fake txt content") + + response = client.post( + "/api/v1/infer", + files={"file": ("test.txt", invalid_file, "text/plain")}, + ) 
+ + assert response.status_code == 400 + assert "Unsupported file type" in response.json()["detail"] + + def test_infer_requires_file(self, client): + """Test that /infer requires a file parameter.""" + response = client.post("/api/v1/infer") + + assert response.status_code == 422 # Unprocessable Entity + + @patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + def test_infer_returns_cross_validation_if_available( + self, + mock_yolo_detector, + mock_pipeline, + client, + sample_png_bytes, + ): + """Test that cross-validation results are included if available.""" + # Setup mocks + mock_detector_instance = Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + # Mock pipeline result with cross-validation + mock_result = Mock() + mock_result.fields = { + "InvoiceNumber": "12345", + "OCR": "1234567", + "Amount": "100.00", + } + mock_result.confidence = { + "InvoiceNumber": 0.95, + "OCR": 0.90, + "Amount": 0.88, + } + mock_result.success = True + mock_result.errors = [] + mock_result.raw_detections = [] + mock_result.document_id = "test123" + mock_result.document_type = "invoice" + mock_result.processing_time_ms = 100.0 + mock_result.visualization_path = None + mock_result.detections = [] + + # Add cross-validation result + mock_cv = Mock() + mock_cv.is_valid = True + mock_cv.payment_line_ocr = "1234567" + mock_cv.ocr_match = True + mock_result.cross_validation = mock_cv + + mock_pipeline_instance.process_image.return_value = mock_result + + # Make request + response = client.post( + "/api/v1/infer", + files={"file": ("test.png", sample_png_bytes, "image/png")}, + ) + + # Verify response includes cross-validation + assert response.status_code == 200 + data = response.json() + + # Note: cross_validation is not currently in the response schema + # This test documents that it should be added + + + 
@patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + def test_infer_handles_processing_errors_gracefully( + self, + mock_yolo_detector, + mock_pipeline, + client, + sample_png_bytes, + ): + """Test that processing errors are handled gracefully.""" + # Setup mocks + mock_detector_instance = Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + # Make pipeline raise an error + mock_pipeline_instance.process_image.side_effect = Exception("Model inference failed") + + # Make request + response = client.post( + "/api/v1/infer", + files={"file": ("test.png", sample_png_bytes, "image/png")}, + ) + + # Verify error handling - service catches exceptions and returns partial results + assert response.status_code == 200 + data = response.json() + assert data["status"] == "partial" + assert data["result"]["success"] is False + assert len(data["result"]["errors"]) > 0 + assert "Model inference failed" in data["result"]["errors"][0] + + +class TestResultsEndpoint: + """Test /api/v1/results/{filename} endpoint.""" + + def test_get_result_image_returns_404_if_not_found(self, client): + """Test that getting non-existent result returns 404.""" + response = client.get("/api/v1/results/nonexistent.png") + assert response.status_code == 404 + + def test_get_result_image_returns_file_if_exists(self, client, test_app, tmp_path): + """Test that existing result file is returned.""" + # Get storage config from app + storage_config = test_app.extra.get("storage_config") + if not storage_config: + pytest.skip("Storage config not available in test app") + + # Create a test result file + result_file = storage_config.result_dir / "test_result.png" + img = Image.new('RGB', (100, 100), color='red') + img.save(result_file) + + # Request the file + response = client.get("/api/v1/results/test_result.png") + + assert response.status_code == 200 + 
assert response.headers["content-type"] == "image/png" + + +class TestInferenceServiceImports: + """Critical test to catch import errors.""" + + def test_inference_service_can_import_modules(self): + """ + Test that InferenceService can import its dependencies. + + This test will fail if there are ImportError issues like: + - from ..inference.pipeline (wrong relative import) + - from src.web.inference (non-existent module) + + It ensures the imports are correct before runtime. + """ + from src.web.services.inference import InferenceService + + # Import the modules that InferenceService tries to import + from src.inference.pipeline import InferencePipeline + from src.inference.yolo_detector import YOLODetector + from src.pdf.renderer import render_pdf_to_images + + # If we got here, all imports work correctly + assert InferencePipeline is not None + assert YOLODetector is not None + assert render_pdf_to_images is not None + assert InferenceService is not None diff --git a/tests/web/test_inference_service.py b/tests/web/test_inference_service.py new file mode 100644 index 0000000..4aef00b --- /dev/null +++ b/tests/web/test_inference_service.py @@ -0,0 +1,297 @@ +""" +Integration tests for inference service. + +Tests the full initialization and processing flow to catch import errors. 
+""" + +import pytest +from pathlib import Path +from unittest.mock import Mock, patch +from PIL import Image +import io + +from src.web.services.inference import InferenceService +from src.web.config import ModelConfig, StorageConfig + + +@pytest.fixture +def model_config(tmp_path): + """Create model configuration for testing.""" + return ModelConfig( + model_path=Path("runs/train/invoice_fields/weights/best.pt"), + confidence_threshold=0.5, + use_gpu=False, # Use CPU for tests + dpi=150, + ) + + +@pytest.fixture +def storage_config(tmp_path): + """Create storage configuration for testing.""" + upload_dir = tmp_path / "uploads" + result_dir = tmp_path / "results" + upload_dir.mkdir() + result_dir.mkdir() + + return StorageConfig( + upload_dir=upload_dir, + result_dir=result_dir, + allowed_extensions={".pdf", ".png", ".jpg", ".jpeg"}, + max_file_size_mb=50, + ) + + +@pytest.fixture +def sample_image(tmp_path): + """Create a sample test image.""" + image_path = tmp_path / "test_invoice.png" + img = Image.new('RGB', (800, 1200), color='white') + img.save(image_path) + return image_path + + +@pytest.fixture +def inference_service(model_config, storage_config): + """Create inference service instance.""" + return InferenceService( + model_config=model_config, + storage_config=storage_config, + ) + + +class TestInferenceServiceInitialization: + """Test inference service initialization to catch import errors.""" + + def test_service_creation(self, inference_service): + """Test that service can be created without errors.""" + assert inference_service is not None + assert not inference_service.is_initialized + + def test_gpu_available_check(self, inference_service): + """Test GPU availability check (should not crash).""" + gpu_available = inference_service.gpu_available + assert isinstance(gpu_available, bool) + + @patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + def test_initialize_imports_correctly( + self, + 
mock_yolo_detector, + mock_pipeline, + inference_service, + ): + """ + Test that initialize() imports modules correctly. + + This test ensures that the import statements in initialize() + use correct paths and don't fail with ImportError. + """ + # Mock the constructors to avoid actually loading models + mock_detector_instance = Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + # Initialize should not raise ImportError + inference_service.initialize() + + # Verify initialization succeeded + assert inference_service.is_initialized + + # Verify both mocked constructors were invoked exactly once (arguments are not checked here) + mock_yolo_detector.assert_called_once() + mock_pipeline.assert_called_once() + + @patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + def test_initialize_sets_up_pipeline( + self, + mock_yolo_detector, + mock_pipeline, + inference_service, + model_config, + ): + """Test that initialize sets up pipeline with correct config.""" + mock_detector_instance = Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + inference_service.initialize() + + # Check YOLO detector was initialized correctly + mock_yolo_detector.assert_called_once_with( + str(model_config.model_path), + confidence_threshold=model_config.confidence_threshold, + device="cpu", # use_gpu=False in fixture + ) + + # Check pipeline was initialized correctly + mock_pipeline.assert_called_once_with( + model_path=str(model_config.model_path), + confidence_threshold=model_config.confidence_threshold, + use_gpu=False, + dpi=150, + enable_fallback=True, + ) + + @patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + def test_initialize_idempotent( + self, + mock_yolo_detector, + mock_pipeline, + inference_service, + ): +
"""Test that calling initialize() multiple times is safe.""" + mock_detector_instance = Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + # Call initialize twice + inference_service.initialize() + inference_service.initialize() + + # Should only be called once due to is_initialized check + assert mock_yolo_detector.call_count == 1 + assert mock_pipeline.call_count == 1 + + +class TestInferenceServiceProcessing: + """Test inference processing methods.""" + + @patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + @patch('ultralytics.YOLO') + def test_process_image_basic_flow( + self, + mock_yolo_class, + mock_yolo_detector, + mock_pipeline, + inference_service, + sample_image, + ): + """Test basic image processing flow.""" + # Setup mocks + mock_detector_instance = Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + # Mock pipeline result + mock_result = Mock() + mock_result.fields = {"InvoiceNumber": "12345"} + mock_result.confidence = {"InvoiceNumber": 0.95} + mock_result.success = True + mock_result.errors = [] + mock_result.raw_detections = [] + mock_pipeline_instance.process_image.return_value = mock_result + + # Process image + result = inference_service.process_image(sample_image) + + # Verify result + assert result.success + assert result.fields == {"InvoiceNumber": "12345"} + assert result.confidence == {"InvoiceNumber": 0.95} + assert result.processing_time_ms > 0 + + @patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + def test_process_image_handles_errors( + self, + mock_yolo_detector, + mock_pipeline, + inference_service, + sample_image, + ): + """Test that processing errors are handled gracefully.""" + # Setup mocks + mock_detector_instance 
= Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + # Make pipeline raise an error + mock_pipeline_instance.process_image.side_effect = Exception("Test error") + + # Process should not crash + result = inference_service.process_image(sample_image) + + # Verify error handling + assert not result.success + assert len(result.errors) > 0 + assert "Test error" in result.errors[0] + + +class TestInferenceServicePDFRendering: + """Test PDF rendering imports.""" + + @patch('src.inference.pipeline.InferencePipeline') + @patch('src.inference.yolo_detector.YOLODetector') + @patch('src.pdf.renderer.render_pdf_to_images') + @patch('ultralytics.YOLO') + def test_pdf_visualization_imports_correctly( + self, + mock_yolo_class, + mock_render_pdf, + mock_yolo_detector, + mock_pipeline, + inference_service, + tmp_path, + ): + """ + Test that _save_pdf_visualization imports render_pdf_to_images correctly. 
+ + This catches the import error we had with: + from ..pdf.renderer (wrong) vs from src.pdf.renderer (correct) + """ + # Setup mocks + mock_detector_instance = Mock() + mock_pipeline_instance = Mock() + mock_yolo_detector.return_value = mock_detector_instance + mock_pipeline.return_value = mock_pipeline_instance + + # Create a fake PDF path + pdf_path = tmp_path / "test.pdf" + pdf_path.touch() + + # Mock render_pdf_to_images to return an image + image_bytes = io.BytesIO() + img = Image.new('RGB', (800, 1200), color='white') + img.save(image_bytes, format='PNG') + mock_render_pdf.return_value = [(1, image_bytes.getvalue())] + + # Mock YOLO + mock_model_instance = Mock() + mock_result = Mock() + mock_result.save = Mock() + mock_model_instance.predict.return_value = [mock_result] + mock_yolo_class.return_value = mock_model_instance + + # This should not raise ImportError + result_path = inference_service._save_pdf_visualization(pdf_path, "test123") + + # Verify import was successful + mock_render_pdf.assert_called_once() + assert result_path is not None + + +@pytest.mark.skipif( + not Path("runs/train/invoice_fields/weights/best.pt").exists(), + reason="Model file not available" +) +class TestInferenceServiceRealModel: + """Integration tests with real model (skip if model not available).""" + + def test_real_initialization(self, model_config, storage_config): + """Test real initialization with actual model.""" + service = InferenceService(model_config, storage_config) + + # This should work with the real imports + service.initialize() + + assert service.is_initialized + assert service._pipeline is not None + assert service._detector is not None diff --git a/tests/web/test_rate_limiter.py b/tests/web/test_rate_limiter.py new file mode 100644 index 0000000..5b191ff --- /dev/null +++ b/tests/web/test_rate_limiter.py @@ -0,0 +1,154 @@ +""" +Tests for the RateLimiter class. 
+""" + +import time +from datetime import datetime, timedelta +from unittest.mock import MagicMock + +import pytest + +from src.data.async_request_db import ApiKeyConfig +from src.web.rate_limiter import RateLimiter, RateLimitConfig, RateLimitStatus + + +class TestRateLimiter: + """Tests for RateLimiter.""" + + def test_check_submit_limit_allowed(self, rate_limiter, mock_db): + """Test that requests are allowed under the limit.""" + status = rate_limiter.check_submit_limit("test-api-key") + + assert status.allowed is True + assert status.remaining_requests >= 0 + assert status.retry_after_seconds is None + + def test_check_submit_limit_rate_exceeded(self, rate_limiter, mock_db): + """Test rate limit exceeded when too many requests.""" + # Record 10 requests (the default limit) + for _ in range(10): + rate_limiter.record_request("test-api-key") + + status = rate_limiter.check_submit_limit("test-api-key") + + assert status.allowed is False + assert status.remaining_requests == 0 + assert status.retry_after_seconds is not None + assert status.retry_after_seconds > 0 + assert "rate limit" in status.reason.lower() + + def test_check_submit_limit_concurrent_jobs_exceeded(self, rate_limiter, mock_db): + """Test rejection when max concurrent jobs reached.""" + # Mock active jobs at the limit + mock_db.count_active_jobs.return_value = 3 # Max is 3 + + status = rate_limiter.check_submit_limit("test-api-key") + + assert status.allowed is False + assert "concurrent" in status.reason.lower() + + def test_record_request(self, rate_limiter, mock_db): + """Test that recording a request works.""" + rate_limiter.record_request("test-api-key") + + # Should have called the database + mock_db.record_rate_limit_event.assert_called_once_with("test-api-key", "request") + + def test_check_poll_limit_allowed(self, rate_limiter, mock_db): + """Test that polling is allowed initially.""" + status = rate_limiter.check_poll_limit("test-api-key", "request-123") + + assert status.allowed is True + 
+ def test_check_poll_limit_too_frequent(self, rate_limiter, mock_db): + """Test that rapid polling is rejected.""" + # First poll should succeed + status1 = rate_limiter.check_poll_limit("test-api-key", "request-123") + assert status1.allowed is True + + # Immediate second poll should fail + status2 = rate_limiter.check_poll_limit("test-api-key", "request-123") + assert status2.allowed is False + assert "polling" in status2.reason.lower() + assert status2.retry_after_seconds is not None + + def test_check_poll_limit_different_requests(self, rate_limiter, mock_db): + """Test that different request_ids have separate poll limits.""" + # Poll request 1 + status1 = rate_limiter.check_poll_limit("test-api-key", "request-1") + assert status1.allowed is True + + # Poll request 2 should also be allowed + status2 = rate_limiter.check_poll_limit("test-api-key", "request-2") + assert status2.allowed is True + + def test_sliding_window_expires(self, rate_limiter, mock_db): + """Test that recorded requests count against the sliding window (expiry itself is not exercised here).""" + # Record requests + for _ in range(5): + rate_limiter.record_request("test-api-key") + + # Check status - 5 of the 10 allowed requests are used, so 4 remain after this check + status1 = rate_limiter.check_submit_limit("test-api-key") + assert status1.allowed is True + assert status1.remaining_requests == 4 # 10 - 5 - 1 (for this check) + + def test_get_rate_limit_headers(self, rate_limiter): + """Test rate limit header generation.""" + status = RateLimitStatus( + allowed=False, + remaining_requests=0, + reset_at=datetime.utcnow() + timedelta(seconds=30), + retry_after_seconds=30, + ) + + headers = rate_limiter.get_rate_limit_headers(status) + + assert "X-RateLimit-Remaining" in headers + assert headers["X-RateLimit-Remaining"] == "0" + assert "Retry-After" in headers + assert headers["Retry-After"] == "30" + + def test_cleanup_poll_timestamps(self, rate_limiter, mock_db): + """Test cleanup of old poll timestamps.""" + # Add some poll timestamps + rate_limiter.check_poll_limit("test-api-key",
"old-request") + + # Manually age the timestamp + rate_limiter._poll_timestamps[("test-api-key", "old-request")] = time.time() - 7200 + + # Run cleanup with 1 hour max age + cleaned = rate_limiter.cleanup_poll_timestamps(max_age_seconds=3600) + + assert cleaned == 1 + assert ("test-api-key", "old-request") not in rate_limiter._poll_timestamps + + def test_cleanup_request_windows(self, rate_limiter, mock_db): + """Test cleanup of empty request windows.""" + # Add some old requests + rate_limiter._request_windows["old-key"] = [time.time() - 120] + + # Run cleanup + rate_limiter.cleanup_request_windows() + + # Old entries should be removed + assert "old-key" not in rate_limiter._request_windows + + def test_config_caching(self, rate_limiter, mock_db): + """Test that API key configs are cached.""" + # First call should query database + rate_limiter._get_config("test-api-key") + assert mock_db.get_api_key_config.call_count == 1 + + # Second call should use cache + rate_limiter._get_config("test-api-key") + assert mock_db.get_api_key_config.call_count == 1 # Still 1 + + def test_default_config_for_unknown_key(self, rate_limiter, mock_db): + """Test that unknown API keys get default config.""" + mock_db.get_api_key_config.return_value = None + + config = rate_limiter._get_config("unknown-key") + + assert config.requests_per_minute == 10 # Default + assert config.max_concurrent_jobs == 3 # Default diff --git a/tests/web/test_training_phase4.py b/tests/web/test_training_phase4.py new file mode 100644 index 0000000..41ae02d --- /dev/null +++ b/tests/web/test_training_phase4.py @@ -0,0 +1,384 @@ +""" +Tests for Phase 4: Training Data Management +""" + +import pytest +from datetime import datetime +from uuid import uuid4 + +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.web.api.v1.admin.training import create_training_router +from src.web.core.auth import validate_admin_token, get_admin_db + + +class MockTrainingTask: + """Mock TrainingTask 
for testing.""" + + def __init__(self, **kwargs): + self.task_id = kwargs.get('task_id', uuid4()) + self.admin_token = kwargs.get('admin_token', 'test-token') + self.name = kwargs.get('name', 'Test Training') + self.description = kwargs.get('description', None) + self.status = kwargs.get('status', 'completed') + self.task_type = kwargs.get('task_type', 'train') + self.config = kwargs.get('config', {}) + self.scheduled_at = kwargs.get('scheduled_at', None) + self.cron_expression = kwargs.get('cron_expression', None) + self.is_recurring = kwargs.get('is_recurring', False) + self.started_at = kwargs.get('started_at', datetime.utcnow()) + self.completed_at = kwargs.get('completed_at', datetime.utcnow()) + self.error_message = kwargs.get('error_message', None) + self.result_metrics = kwargs.get('result_metrics', {}) + self.model_path = kwargs.get('model_path', 'runs/train/test/weights/best.pt') + self.document_count = kwargs.get('document_count', 0) + self.metrics_mAP = kwargs.get('metrics_mAP', 0.935) + self.metrics_precision = kwargs.get('metrics_precision', 0.92) + self.metrics_recall = kwargs.get('metrics_recall', 0.88) + self.created_at = kwargs.get('created_at', datetime.utcnow()) + self.updated_at = kwargs.get('updated_at', datetime.utcnow()) + + +class MockTrainingDocumentLink: + """Mock TrainingDocumentLink for testing.""" + + def __init__(self, **kwargs): + self.link_id = kwargs.get('link_id', uuid4()) + self.task_id = kwargs.get('task_id') + self.document_id = kwargs.get('document_id') + self.annotation_snapshot = kwargs.get('annotation_snapshot', None) + self.created_at = kwargs.get('created_at', datetime.utcnow()) + + +class MockAdminDocument: + """Mock AdminDocument for testing.""" + + def __init__(self, **kwargs): + self.document_id = kwargs.get('document_id', uuid4()) + self.admin_token = kwargs.get('admin_token', 'test-token') + self.filename = kwargs.get('filename', 'test.pdf') + self.file_size = kwargs.get('file_size', 100000) + self.content_type = 
kwargs.get('content_type', 'application/pdf') + self.file_path = kwargs.get('file_path', 'data/admin_docs/test.pdf') + self.page_count = kwargs.get('page_count', 1) + self.status = kwargs.get('status', 'labeled') + self.auto_label_status = kwargs.get('auto_label_status', None) + self.auto_label_error = kwargs.get('auto_label_error', None) + self.upload_source = kwargs.get('upload_source', 'ui') + self.batch_id = kwargs.get('batch_id', None) + self.csv_field_values = kwargs.get('csv_field_values', None) + self.auto_label_queued_at = kwargs.get('auto_label_queued_at', None) + self.annotation_lock_until = kwargs.get('annotation_lock_until', None) + self.created_at = kwargs.get('created_at', datetime.utcnow()) + self.updated_at = kwargs.get('updated_at', datetime.utcnow()) + + +class MockAnnotation: + """Mock AdminAnnotation for testing.""" + + def __init__(self, **kwargs): + self.annotation_id = kwargs.get('annotation_id', uuid4()) + self.document_id = kwargs.get('document_id') + self.page_number = kwargs.get('page_number', 1) + self.class_id = kwargs.get('class_id', 0) + self.class_name = kwargs.get('class_name', 'invoice_number') + self.bbox_x = kwargs.get('bbox_x', 100) + self.bbox_y = kwargs.get('bbox_y', 100) + self.bbox_width = kwargs.get('bbox_width', 200) + self.bbox_height = kwargs.get('bbox_height', 50) + self.x_center = kwargs.get('x_center', 0.5) + self.y_center = kwargs.get('y_center', 0.5) + self.width = kwargs.get('width', 0.3) + self.height = kwargs.get('height', 0.1) + self.text_value = kwargs.get('text_value', 'INV-001') + self.confidence = kwargs.get('confidence', 0.95) + self.source = kwargs.get('source', 'manual') + self.is_verified = kwargs.get('is_verified', False) + self.verified_at = kwargs.get('verified_at', None) + self.verified_by = kwargs.get('verified_by', None) + self.override_source = kwargs.get('override_source', None) + self.original_annotation_id = kwargs.get('original_annotation_id', None) + self.created_at = 
kwargs.get('created_at', datetime.utcnow()) + self.updated_at = kwargs.get('updated_at', datetime.utcnow()) + + +class MockAdminDB: + """In-memory mock of AdminDB for testing Phase 4; all four stores are plain dicts looked up by str(id).""" + + def __init__(self): + self.documents = {} + self.annotations = {} + self.training_tasks = {} + self.training_links = {} + + def get_documents_for_training( + self, + admin_token, + status="labeled", + has_annotations=True, + min_annotation_count=None, + exclude_used_in_training=False, + limit=100, + offset=0, + ): + """Return (page, total) of documents that match admin_token and status and pass the annotation and training-use filters.""" + # Filter documents by criteria + filtered = [] + for doc in self.documents.values(): + if doc.admin_token != admin_token or doc.status != status: + continue + + # Check annotations + annotations = self.annotations.get(str(doc.document_id), []) + if has_annotations and len(annotations) == 0: + continue + if min_annotation_count and len(annotations) < min_annotation_count: + continue + + # Check if used in training + if exclude_used_in_training: + links = self.training_links.get(str(doc.document_id), []) + if links: + continue + + filtered.append(doc) + + total = len(filtered) + return filtered[offset:offset+limit], total + + def get_annotations_for_document(self, document_id): + """Return the annotations stored for document_id, or an empty list if there are none.""" + return self.annotations.get(str(document_id), []) + + def get_document_training_tasks(self, document_id): + """Return the training links stored for document_id, or an empty list if it was never linked.""" + return self.training_links.get(str(document_id), []) + + def get_training_tasks_by_token( + self, + admin_token, + status=None, + limit=20, + offset=0, + ): + """Return (page, total) of training tasks whose admin_token matches, optionally restricted to one status.""" + tasks = [t for t in self.training_tasks.values() if t.admin_token == admin_token] + if status: + tasks = [t for t in tasks if t.status == status] + + total = len(tasks) + return tasks[offset:offset+limit], total + + def get_training_task(self, task_id): + """Return the training task stored under str(task_id), or None when it is unknown.""" + return self.training_tasks.get(str(task_id)) + + +@pytest.fixture +def app(): + 
"""Create a test FastAPI app with the training router, a seeded MockAdminDB and overridden auth/DB dependencies.""" + app = FastAPI() + + # Create mock DB + mock_db = MockAdminDB() + + # Add test documents + doc1 = MockAdminDocument( + filename="DOC001.pdf", + status="labeled", + ) + doc2 = MockAdminDocument( + filename="DOC002.pdf", + status="labeled", + ) + doc3 = MockAdminDocument( + filename="DOC003.pdf", + status="labeled", + ) + + mock_db.documents[str(doc1.document_id)] = doc1 + mock_db.documents[str(doc2.document_id)] = doc2 + mock_db.documents[str(doc3.document_id)] = doc3 + + # Add annotations + mock_db.annotations[str(doc1.document_id)] = [ + MockAnnotation(document_id=doc1.document_id, source="manual"), + MockAnnotation(document_id=doc1.document_id, source="auto"), + ] + mock_db.annotations[str(doc2.document_id)] = [ + MockAnnotation(document_id=doc2.document_id, source="auto"), + MockAnnotation(document_id=doc2.document_id, source="auto"), + MockAnnotation(document_id=doc2.document_id, source="auto"), + ] + # doc3 has no annotations + + # Add training tasks + task1 = MockTrainingTask( + name="Training Run 2024-01", + status="completed", + document_count=500, + metrics_mAP=0.935, + metrics_precision=0.92, + metrics_recall=0.88, + ) + task2 = MockTrainingTask( + name="Training Run 2024-02", + status="completed", + document_count=600, + metrics_mAP=0.951, + metrics_precision=0.94, + metrics_recall=0.92, + ) + + mock_db.training_tasks[str(task1.task_id)] = task1 + mock_db.training_tasks[str(task2.task_id)] = task2 + + # Add training links (doc1 used in task1) + link1 = MockTrainingDocumentLink( + task_id=task1.task_id, + document_id=doc1.document_id, + ) + mock_db.training_links[str(doc1.document_id)] = [link1] + + # Override dependencies + app.dependency_overrides[validate_admin_token] = lambda: "test-token" + app.dependency_overrides[get_admin_db] = lambda: mock_db + + # Include router + router = create_training_router() + app.include_router(router) + + return app + + +@pytest.fixture +def client(app): + """Create a TestClient for the app fixture.""" + 
return TestClient(app) + + +class TestTrainingDocuments: + """Tests for GET /admin/training/documents endpoint.""" + + def test_get_training_documents_success(self, client): + """Test that an unfiltered request returns 200 with a total and a documents list.""" + response = client.get("/admin/training/documents") + + assert response.status_code == 200 + data = response.json() + assert "total" in data + assert "documents" in data + assert data["total"] >= 0 + assert isinstance(data["documents"], list) + + def test_get_training_documents_with_annotations(self, client): + """Test filtering documents with annotations.""" + response = client.get("/admin/training/documents?has_annotations=true") + + assert response.status_code == 200 + data = response.json() + # Should return doc1 and doc2 (both have annotations) + assert data["total"] == 2 + + def test_get_training_documents_min_annotation_count(self, client): + """Test filtering by minimum annotation count.""" + response = client.get("/admin/training/documents?min_annotation_count=3") + + assert response.status_code == 200 + data = response.json() + # Should return only doc2 (has 3 annotations) + assert data["total"] == 1 + + def test_get_training_documents_exclude_used(self, client): + """Test excluding documents already used in training.""" + response = client.get("/admin/training/documents?exclude_used_in_training=true") + + assert response.status_code == 200 + data = response.json() + # Should exclude doc1 (linked to task1 in the app fixture) + assert data["total"] == 1 # Only doc2 (doc3 has no annotations) + + def test_get_training_documents_annotation_sources(self, client): + """Test that annotation sources are included.""" + response = client.get("/admin/training/documents?has_annotations=true") + + assert response.status_code == 200 + data = response.json() + # Check that documents have annotation_sources field + for doc in data["documents"]: + assert "annotation_sources" in doc + assert isinstance(doc["annotation_sources"], dict) + assert "manual" in 
doc["annotation_sources"] + assert "auto" in doc["annotation_sources"] + + def test_get_training_documents_pagination(self, client): + """Test that limit and offset are echoed back and that at most one document is returned.""" + response = client.get("/admin/training/documents?limit=1&offset=0") + + assert response.status_code == 200 + data = response.json() + assert data["limit"] == 1 + assert data["offset"] == 0 + assert len(data["documents"]) <= 1 + + +class TestTrainingModels: + """Tests for GET /admin/training/models endpoint.""" + + def test_get_training_models_success(self, client): + """Test that both training tasks seeded by the app fixture are listed as models.""" + response = client.get("/admin/training/models") + + assert response.status_code == 200 + data = response.json() + assert "total" in data + assert "models" in data + assert data["total"] == 2 + assert len(data["models"]) == 2 + + def test_get_training_models_includes_metrics(self, client): + """Test that models include metrics.""" + response = client.get("/admin/training/models") + + assert response.status_code == 200 + data = response.json() + # Check first model has metrics + model = data["models"][0] + assert "metrics" in model + assert "mAP" in model["metrics"] + assert model["metrics"]["mAP"] is not None + assert "precision" in model["metrics"] + assert "recall" in model["metrics"] + + def test_get_training_models_includes_download_url(self, client): + """Test that completed models have download URLs.""" + response = client.get("/admin/training/models") + + assert response.status_code == 200 + data = response.json() + # Check completed models have download URLs + for model in data["models"]: + if model["status"] == "completed": + assert "download_url" in model + assert model["download_url"] is not None + + def test_get_training_models_filter_by_status(self, client): + """Test filtering models by status.""" + response = client.get("/admin/training/models?status=completed") + + assert response.status_code == 200 + data = response.json() + # All returned models should be completed + for model 
in data["models"]: + assert model["status"] == "completed" + + def test_get_training_models_pagination(self, client): + """Test that limit=1 yields exactly one model and that limit and offset are echoed back.""" + response = client.get("/admin/training/models?limit=1&offset=0") + + assert response.status_code == 200 + data = response.json() + assert data["limit"] == 1 + assert data["offset"] == 0 + assert len(data["models"]) == 1 diff --git a/update_test_imports.py b/update_test_imports.py new file mode 100644 index 0000000..7c8d7ed --- /dev/null +++ b/update_test_imports.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +"""Update test imports to use new structure.""" + +import re +from pathlib import Path + +# Import mapping: old -> new (keys are regex patterns, values are literal replacements) +IMPORT_MAPPINGS = { + # Admin routes + r'from src\.web\.admin_routes import': 'from src.web.api.v1.admin.documents import', + r'from src\.web\.admin_annotation_routes import': 'from src.web.api.v1.admin.annotations import', + r'from src\.web\.admin_training_routes import': 'from src.web.api.v1.admin.training import', + + # Auth and core + r'from src\.web\.admin_auth import': 'from src.web.core.auth import', + r'from src\.web\.admin_autolabel import': 'from src.web.services.autolabel import', + r'from src\.web\.admin_scheduler import': 'from src.web.core.scheduler import', + + # Schemas + r'from src\.web\.admin_schemas import': 'from src.web.schemas.admin import', + r'from src\.web\.schemas import': 'from src.web.schemas.inference import', + + # Services + r'from src\.web\.services import': 'from src.web.services.inference import', + r'from src\.web\.async_service import': 'from src.web.services.async_processing import', + r'from src\.web\.batch_upload_service import': 'from src.web.services.batch_upload import', + + # Workers + r'from src\.web\.async_queue import': 'from src.web.workers.async_queue import', + r'from src\.web\.batch_queue import': 'from src.web.workers.batch_queue import', + + # Routes (routes.py moved to api/v1/public/inference.py; async routes live in api/v1/public/async_api.py) + r'from src\.web\.routes import': 'from src.web.api.v1.public.inference import', + r'from 
src\.web\.async_routes import': 'from src.web.api.v1.public.async_api import', + r'from src\.web\.batch_upload_routes import': 'from src.web.api.v1.batch.routes import', +} + +def update_file(file_path: Path) -> bool: + """Apply every IMPORT_MAPPINGS substitution to file_path; write it back and return True only if the text changed.""" + content = file_path.read_text(encoding='utf-8') + original_content = content + + for old_pattern, new_import in IMPORT_MAPPINGS.items(): + content = re.sub(old_pattern, new_import, content) + + if content != original_content: + file_path.write_text(content, encoding='utf-8') + return True + return False + +def main(): + """Run update_file on each tests/web/test_*.py file and print the names of the files that changed.""" + test_dir = Path('tests/web') + updated_files = [] + + for test_file in test_dir.glob('test_*.py'): + if update_file(test_file): + updated_files.append(test_file.name) + + if updated_files: + print(f"✓ Updated {len(updated_files)} test files:") + for filename in sorted(updated_files): + print(f" - {filename}") + else: + print("No files needed updating") + +if __name__ == '__main__': + main()