WIP
@@ -7,7 +7,8 @@
       "Edit(*)",
       "Glob(*)",
       "Grep(*)",
-      "Task(*)"
+      "Task(*)",
+      "Bash(wsl bash -c \"source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && pytest tests/web/test_batch_upload_routes.py::TestBatchUploadRoutes::test_upload_batch_async_mode_default -v -s 2>&1 | head -100\")"
     ]
   }
 }
@@ -81,7 +81,13 @@
       "Bash(wsl bash -c \"cat /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/runs/train/invoice_fields/results.csv\")",
       "Bash(wsl bash -c \"ls -la /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2/runs/train/invoice_fields/weights/\")",
       "Bash(wsl bash -c \"cat ''/mnt/c/Users/yaoji/AppData/Local/Temp/claude/c--Users-yaoji-git-ColaCoder-invoice-master-poc-v2/tasks/b8d8565.output'' 2>/dev/null | tail -100\")",
-      "Bash(wsl bash -c:*)"
+      "Bash(wsl bash -c:*)",
+      "Bash(wsl bash -c \"cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && python -m pytest tests/web/test_admin_*.py -v --tb=short 2>&1 | head -120\")",
+      "Bash(wsl bash -c \"cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && python -m pytest tests/web/test_admin_*.py -v --tb=short 2>&1 | head -80\")",
+      "Bash(wsl bash -c \"cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && python -m pytest tests/ -v --tb=short 2>&1 | tail -60\")",
+      "Bash(wsl bash -c \"source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && python -m pytest tests/data/test_admin_models_v2.py -v 2>&1 | head -100\")",
+      "Bash(dir src\\\\web\\\\*admin* src\\\\web\\\\*batch*)",
+      "Bash(wsl bash -c \"source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && cd /mnt/c/Users/yaoji/git/ColaCoder/invoice-master-poc-v2 && python3 -c \"\"\n# Test FastAPI Form parsing behavior\nfrom fastapi import Form\nfrom typing import Annotated\n\n# Simulate what happens when data={''upload_source'': ''ui''} is sent\n# and async_mode is not in the data\nprint\\(''Test 1: async_mode not provided, default should be True''\\)\nprint\\(''Expected: True''\\)\n\n# In FastAPI, when Form has a default, it will use that default if not provided\n# But we need to verify this is actually happening\n\"\"\")"
     ],
     "deny": [],
     "ask": [],
README.md (34 lines changed)
@@ -76,6 +76,38 @@
 | 8 | payment_line | Payment line (machine-readable format) |
 | 9 | customer_number | Customer number |
+
+## DPI Configuration
+
+**Important**: All components of the system use **150 DPI**, ensuring consistency between training and inference.
+
+The DPI (dots per inch) setting must be identical at training and inference time; otherwise it causes:
+- Detection-box size mismatches
+- A significant mAP drop (potentially from 93.5% down to 60-70%)
+- Missed or spurious field detections
+
+### Configuration Locations
+
+| Component | Config file | Setting |
+|------|---------|--------|
+| **Global constant** | `src/config.py` | `DEFAULT_DPI = 150` |
+| **Web inference** | `src/web/config.py` | `ModelConfig.dpi` (imported from `src.config`) |
+| **CLI inference** | `src/cli/infer.py` | `--dpi` default = `DEFAULT_DPI` |
+| **Auto-labeling** | `src/config.py` | `AUTOLABEL['dpi'] = DEFAULT_DPI` |
+| **PDF-to-image** | `src/web/api/v1/admin/documents.py` | uses `DEFAULT_DPI` |
+
+### Usage Examples
+
+```bash
+# Training (uses the default 150 DPI)
+python -m src.cli.autolabel --dual-pool --cpu-workers 3 --gpu-workers 1
+
+# Inference (defaults to 150 DPI, consistent with training)
+python -m src.cli.infer -m runs/train/invoice_fields/weights/best.pt -i invoice.pdf
+
+# Specify the DPI manually (only when pairing with a model trained at a non-default DPI)
+python -m src.cli.infer -m custom_model.pt -i invoice.pdf --dpi 150
+```
+
 ## Installation

 ```bash
@@ -490,7 +522,7 @@ Options:
   --input, -i       Input PDF/image
   --output, -o      Output JSON path
   --confidence      Confidence threshold (default: 0.5)
-  --dpi             Render DPI (default: 300)
+  --dpi             Render DPI (default: 150; must match the training DPI)
   --gpu             Use GPU
 ```
create_shims.sh (new file, 96 lines)
@@ -0,0 +1,96 @@
#!/bin/bash

# Create backward compatibility shims for all migrated files

# admin_auth.py -> core/auth.py
cat > src/web/admin_auth.py << 'EOF'
"""DEPRECATED: Import from src.web.core.auth instead"""
from src.web.core.auth import * # noqa: F401, F403
EOF

# admin_autolabel.py -> services/autolabel.py
cat > src/web/admin_autolabel.py << 'EOF'
"""DEPRECATED: Import from src.web.services.autolabel instead"""
from src.web.services.autolabel import * # noqa: F401, F403
EOF

# admin_scheduler.py -> core/scheduler.py
cat > src/web/admin_scheduler.py << 'EOF'
"""DEPRECATED: Import from src.web.core.scheduler instead"""
from src.web.core.scheduler import * # noqa: F401, F403
EOF

# admin_schemas.py -> schemas/admin.py
cat > src/web/admin_schemas.py << 'EOF'
"""DEPRECATED: Import from src.web.schemas.admin instead"""
from src.web.schemas.admin import * # noqa: F401, F403
EOF

# schemas.py -> schemas/inference.py + schemas/common.py
cat > src/web/schemas.py << 'EOF'
"""DEPRECATED: Import from src.web.schemas.inference or src.web.schemas.common instead"""
from src.web.schemas.inference import * # noqa: F401, F403
from src.web.schemas.common import * # noqa: F401, F403
EOF

# services.py -> services/inference.py
cat > src/web/services.py << 'EOF'
"""DEPRECATED: Import from src.web.services.inference instead"""
from src.web.services.inference import * # noqa: F401, F403
EOF

# async_queue.py -> workers/async_queue.py
cat > src/web/async_queue.py << 'EOF'
"""DEPRECATED: Import from src.web.workers.async_queue instead"""
from src.web.workers.async_queue import * # noqa: F401, F403
EOF

# async_service.py -> services/async_processing.py
cat > src/web/async_service.py << 'EOF'
"""DEPRECATED: Import from src.web.services.async_processing instead"""
from src.web.services.async_processing import * # noqa: F401, F403
EOF

# batch_queue.py -> workers/batch_queue.py
cat > src/web/batch_queue.py << 'EOF'
"""DEPRECATED: Import from src.web.workers.batch_queue instead"""
from src.web.workers.batch_queue import * # noqa: F401, F403
EOF

# batch_upload_service.py -> services/batch_upload.py
cat > src/web/batch_upload_service.py << 'EOF'
"""DEPRECATED: Import from src.web.services.batch_upload instead"""
from src.web.services.batch_upload import * # noqa: F401, F403
EOF

# batch_upload_routes.py -> api/v1/batch/routes.py
cat > src/web/batch_upload_routes.py << 'EOF'
"""DEPRECATED: Import from src.web.api.v1.batch.routes instead"""
from src.web.api.v1.batch.routes import * # noqa: F401, F403
EOF

# admin_routes.py -> api/v1/admin/documents.py
cat > src/web/admin_routes.py << 'EOF'
"""DEPRECATED: Import from src.web.api.v1.admin.documents instead"""
from src.web.api.v1.admin.documents import * # noqa: F401, F403
EOF

# admin_annotation_routes.py -> api/v1/admin/annotations.py
cat > src/web/admin_annotation_routes.py << 'EOF'
"""DEPRECATED: Import from src.web.api.v1.admin.annotations instead"""
from src.web.api.v1.admin.annotations import * # noqa: F401, F403
EOF

# admin_training_routes.py -> api/v1/admin/training.py
cat > src/web/admin_training_routes.py << 'EOF'
"""DEPRECATED: Import from src.web.api.v1.admin.training instead"""
from src.web.api.v1.admin.training import * # noqa: F401, F403
EOF

# routes.py -> api/v1/routes.py
cat > src/web/routes.py << 'EOF'
"""DEPRECATED: Import from src.web.api.v1.routes instead"""
from src.web.api.v1.routes import * # noqa: F401, F403
EOF

echo "✓ Created backward compatibility shims for all migrated files"
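One way to sanity-check the result (a hedged sketch, not part of the commit; it assumes the new modules created by the migration exist on the import path):

```python
import importlib

# Hypothetical smoke test: every legacy import path should still resolve
# through its shim to the new module location.
legacy_modules = [
    "src.web.admin_auth", "src.web.schemas", "src.web.services",
    "src.web.async_queue", "src.web.routes",
]
for name in legacy_modules:
    module = importlib.import_module(name)  # raises ImportError if a shim is broken
    print(f"OK: {name} -> {module.__doc__}")
```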
@@ -1,405 +0,0 @@
# Invoice Master POC v2 - Code Review Report

**Review date**: 2026-01-22
**Codebase size**: 67 Python source files, ~22,434 lines of code
**Test coverage**: ~40-50%

---

## Executive Summary

### Overall assessment: **Good (B+)**

**Strengths**:
- ✅ Clear modular architecture with good separation of responsibilities
- ✅ Appropriate use of dataclasses and type hints
- ✅ Comprehensive normalization logic for Swedish invoices
- ✅ Spatial-index optimization (O(1) token lookup)
- ✅ Solid fallback mechanism (OCR fallback when YOLO fails)
- ✅ Well-designed web API and UI

**Main issues**:
- ❌ Duplicated payment-line parsing code (3+ places)
- ❌ Long functions (`_normalize_customer_number` is 127 lines)
- ❌ Configuration security problem (plaintext database password)
- ❌ Inconsistent exception handling (generic Exception everywhere)
- ❌ Missing integration tests
- ❌ Magic numbers scattered throughout (0.5, 0.95, 300, etc.)

---

## 1. Architecture Analysis

### 1.1 Module Structure

```
src/
├── inference/          # Core inference pipeline
│   ├── pipeline.py (517 lines) ⚠️
│   ├── field_extractor.py (1,347 lines) 🔴 too long
│   └── yolo_detector.py
├── web/                # FastAPI web service
│   ├── app.py (765 lines) ⚠️ inline HTML
│   ├── routes.py (184 lines)
│   └── services.py (286 lines)
├── ocr/                # OCR extraction
│   ├── paddle_ocr.py
│   └── machine_code_parser.py (919 lines) 🔴 too long
├── matcher/            # Field matching
│   └── field_matcher.py (875 lines) ⚠️
├── utils/              # Shared utilities
│   ├── validators.py
│   ├── text_cleaner.py
│   ├── fuzzy_matcher.py
│   ├── ocr_corrections.py
│   └── format_variants.py (610 lines)
├── processing/         # Batch processing
├── data/               # Data management
└── cli/                # Command-line tools
```

### 1.2 Inference Flow

```
PDF/image input
    ↓
Render to image (pdf/renderer.py)
    ↓
YOLO detection (yolo_detector.py) - detect field regions
    ↓
Field extraction (field_extractor.py)
    ├→ OCR text extraction (ocr/paddle_ocr.py)
    ├→ Normalization & validation
    └→ Confidence calculation
    ↓
Cross-validation (pipeline.py)
    ├→ Parse the payment_line format
    ├→ Extract OCR/Amount/Account from the payment_line
    └→ Validate against detected fields; payment_line values take precedence
    ↓
Fallback OCR (when key fields are missing)
    ├→ Full-page OCR
    └→ Regex extraction
    ↓
InferenceResult output
```

---

## 2. Code Quality Issues

### 2.1 Long functions (>50 lines) 🔴

| Function | File | Lines | Complexity | Problem |
|------|------|------|--------|------|
| `_normalize_customer_number()` | field_extractor.py | **127** | Very high | 4 levels of pattern matching, 7+ regexes, complex scoring |
| `_cross_validate_payment_line()` | pipeline.py | **127** | Very high | Core validation logic, 8+ conditional branches |
| `_normalize_bankgiro()` | field_extractor.py | 62 | High | Luhn validation + multiple fallbacks |
| `_normalize_plusgiro()` | field_extractor.py | 63 | High | Similar to bankgiro |
| `_normalize_payment_line()` | field_extractor.py | 74 | High | 4 regex patterns |
| `_normalize_amount()` | field_extractor.py | 78 | High | Multi-strategy fallback |

**Example problem** - `_normalize_customer_number()` (lines 776-902):
```python
def _normalize_customer_number(self, text: str):
    # A 127-line function containing:
    # - 4 nested if/for loops
    # - 7 different regex patterns
    # - 5 scoring mechanisms
    # - Handling for both labeled and unlabeled formats
```

**Recommendation**: split into:
- `_find_customer_code_patterns()`
- `_find_labeled_customer_code()`
- `_score_customer_candidates()`

### 2.2 Code duplication 🔴

**Payment-line parsing (3+ duplicate implementations)**:

1. `_parse_machine_readable_payment_line()` (pipeline.py:217-252)
2. `MachineCodeParser.parse()` (machine_code_parser.py, 919 lines)
3. `_normalize_payment_line()` (field_extractor.py:632-705)

All three implement similar regex patterns:
```
Format: # <OCR> # <Kronor> <Öre> <Type> > <Account>#<Check>#
```

**Bankgiro/Plusgiro validation (duplicated)**:
- `validators.py`: `is_valid_bankgiro()`, `format_bankgiro()`
- `field_extractor.py`: `_normalize_bankgiro()`, `_normalize_plusgiro()`, `_luhn_checksum()`
- `normalizer.py`: `normalize_bankgiro()`, `normalize_plusgiro()`
- `field_matcher.py`: similar matching logic

**Recommendation**: create unified modules:
```python
# src/common/payment_line_parser.py
class PaymentLineParser:
    def parse(text: str) -> PaymentLineResult

# src/common/giro_validator.py
class GiroValidator:
    def validate_and_format(value: str, giro_type: str) -> str
```
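For illustration, a minimal sketch of what a unified parser for the format quoted above could look like — the regex, type names, and amount handling here are assumptions for the sketch, not the project's actual implementation:

```python
import re
from dataclasses import dataclass

@dataclass
class PaymentLineResult:
    ocr: str
    amount: str
    account: str

# Hypothetical pattern for: # <OCR> # <Kronor> <Öre> <Type> > <Account>#<Check>#
_PAYMENT_LINE = re.compile(
    r'#\s*(?P<ocr>\d+)\s*#\s*(?P<kronor>\d+)\s+(?P<ore>\d{2})\s+\d+\s*>\s*(?P<account>[\d-]+)#'
)

def parse_payment_line(text: str) -> PaymentLineResult | None:
    """Parse one machine-readable payment line; return None when nothing matches."""
    m = _PAYMENT_LINE.search(text)
    if m is None:
        return None
    return PaymentLineResult(
        ocr=m.group('ocr'),
        amount=f"{m.group('kronor')}.{m.group('ore')}",
        account=m.group('account'),
    )
```

With one parser owning the pattern, the three current implementations would reduce to three call sites.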
### 2.3 Inconsistent error handling ⚠️

**Generic exception catches (31 occurrences)**:
```python
except Exception as e:  # 31 occurrences in the codebase
    result.errors.append(str(e))
```

**Problems**:
- No specific error types are caught
- Generic error messages lose context
- Lines 142-147 (routes.py): catches all exceptions and returns a 500

**Current code** (routes.py:142-147):
```python
try:
    service_result = inference_service.process_pdf(...)
except Exception as e:  # too broad
    logger.error(f"Error processing document: {e}")
    raise HTTPException(status_code=500, detail=str(e))
```

**Suggested improvement**:
```python
except FileNotFoundError:
    raise HTTPException(status_code=400, detail="PDF file not found")
except PyMuPDFError:
    raise HTTPException(status_code=400, detail="Invalid PDF format")
except OCRError:
    raise HTTPException(status_code=503, detail="OCR service unavailable")
```
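The `OCRError` above would have to be project-defined. A minimal sketch of the exception hierarchy this implies — the class names are assumptions, matching the "define custom exception classes" action in Phase 1 below:

```python
class InvoiceProcessingError(Exception):
    """Base class for all pipeline errors (hypothetical)."""

class PDFRenderError(InvoiceProcessingError):
    """Raised when a PDF page cannot be rendered to an image."""

class OCRError(InvoiceProcessingError):
    """Raised when the OCR backend fails or is unavailable."""

class FieldValidationError(InvoiceProcessingError):
    """Raised when an extracted field fails normalization or validation."""
```

A shared base class lets route handlers catch the whole family in one place while inner code raises the specific subtype.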
### 2.4 Configuration security problem 🔴

**config.py lines 24-30** - plaintext credentials:
```python
DATABASE = {
    'host': '192.168.68.31',     # hard-coded IP
    'user': 'docmaster',         # hard-coded username
    'password': 'nY6LYK5d',      # 🔴 plaintext password!
    'database': 'invoice_master'
}
```

**Recommendation**:
```python
DATABASE = {
    'host': os.getenv('DB_HOST', 'localhost'),
    'user': os.getenv('DB_USER', 'docmaster'),
    'password': os.getenv('DB_PASSWORD'),  # read from an environment variable
    'database': os.getenv('DB_NAME', 'invoice_master')
}
```
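One refinement worth considering on top of the snippet above: `os.getenv('DB_PASSWORD')` silently yields `None` when the variable is unset, which only fails later at connection time. A sketch that fails fast at import instead (helper name is an assumption):

```python
import os

def _require_env(name: str) -> str:
    """Return the environment variable or raise a clear startup error."""
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Required environment variable {name} is not set")
    return value

DATABASE = {
    'host': os.getenv('DB_HOST', 'localhost'),
    'user': os.getenv('DB_USER', 'docmaster'),
    'password': _require_env('DB_PASSWORD'),  # fail fast instead of passing None
    'database': os.getenv('DB_NAME', 'invoice_master'),
}
```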
### 2.5 Magic numbers ⚠️

| Value | Location | Purpose | Problem |
|---|------|------|------|
| 0.5 | multiple places | Confidence threshold | Not configurable per field |
| 0.95 | pipeline.py | payment_line confidence | Undocumented |
| 300 | multiple places | DPI | Hard-coded |
| 0.1 | field_extractor.py | BBox padding | Should be configuration |
| 72 | multiple places | PDF base DPI | Magic number in formulas |
| 50 | field_extractor.py | Customer-number score bonus | Undocumented |

**Recommendation**: extract into configuration:
```python
INFERENCE_CONFIG = {
    'confidence_threshold': 0.5,
    'payment_line_confidence': 0.95,
    'dpi': 300,
    'bbox_padding': 0.1,
}
```

### 2.6 Naming inconsistencies ⚠️

**Inconsistent field names**:
- YOLO class names: `invoice_number`, `ocr_number`, `supplier_org_number`
- Field names: `InvoiceNumber`, `OCR`, `supplier_org_number`
- CSV column names: possibly different again
- Database field names: yet another variant

Mappings are maintained in multiple places:
- `yolo_detector.py` (lines 90-100): `CLASS_TO_FIELD`
- several other locations

---

## 3. Test Analysis

### 3.1 Test coverage

**Test files**: 13
- ✅ Well covered: field_matcher, normalizer, payment_line_parser
- ⚠️ Moderately covered: field_extractor, pipeline
- ❌ Poorly covered: web layer, CLI, batch processing

**Estimated coverage**: 40-50%

### 3.2 Missing test cases 🔴

**Critical gaps** (a hedged sketch of one such test follows this list):
1. Cross-validation logic - the most complex part, barely tested
2. payment_line parsing variants - multiple implementations, unclear edge cases
3. OCR error correction - complex logic with several strategies
4. Web API endpoints - no request/response tests
5. Batch processing - multi-worker coordination untested
6. Fallback OCR mechanism - when YOLO detection fails
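As an illustration of gap 4, a sketch of an endpoint test using FastAPI's `TestClient` — the app import path and route are assumptions based on the module layout above, not the project's actual API:

```python
from fastapi.testclient import TestClient
from src.web.app import app  # assumed app location

client = TestClient(app)

def test_process_endpoint_rejects_missing_file():
    # No file attached: the endpoint should fail with a client error,
    # not a masked 500 from a broad `except Exception`.
    response = client.post("/api/v1/process")
    assert response.status_code in (400, 422)
```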
---

## 4. Architecture Risks

### 🔴 Critical risks

1. **Configuration security** - plaintext database credentials in config.py (lines 24-30)
2. **Error recovery** - broad exception handling masks real problems
3. **Testability** - hard-coded dependencies prevent unit testing

### 🟡 High risks

1. **Code maintainability** - duplicated payment-line parsing
2. **Scalability** - no async processing for long-running inference
3. **Extensibility** - adding new field types would be difficult

### 🟢 Medium risks

1. **Performance** - lazy loading helps, but ORM queries are unoptimized
2. **Documentation** - mostly adequate but could be better

---

## 5. Priority Matrix

| Priority | Action | Effort | Impact |
|--------|------|--------|------|
| 🔴 Critical | Fix configuration security (environment variables) | 1 hour | High |
| 🔴 Critical | Add integration tests | 2-3 days | High |
| 🔴 Critical | Document the error-handling strategy | 4 hours | Medium |
| 🟡 High | Unify payment_line parsing | 1-2 days | High |
| 🟡 High | Extract normalization into submodules | 2-3 days | Medium |
| 🟡 High | Add dependency injection | 2-3 days | Medium |
| 🟡 High | Split long functions | 2-3 days | Low |
| 🟢 Medium | Raise test coverage to 70%+ | 3-5 days | High |
| 🟢 Medium | Extract magic numbers | 4 hours | Low |
| 🟢 Medium | Standardize naming conventions | 1-2 days | Medium |

---

## 6. File-specific Recommendations

### High priority (code quality)

| File | Problem | Recommendation |
|------|------|------|
| `field_extractor.py` | 1,347 lines; 6 long normalization methods | Split into a `normalizers/` submodule |
| `pipeline.py` | 127-line `_cross_validate_payment_line()` | Extract into a separate `CrossValidator` class |
| `field_matcher.py` | 875 lines; complex matching logic | Split into a `matching/` submodule |
| `config.py` | Hard-coded credentials (line 29) | Use environment variables |
| `machine_code_parser.py` | 919 lines; payment_line parsing | Merge with the pipeline's parsing |

### Medium priority (refactoring)

| File | Problem | Recommendation |
|------|------|------|
| `app.py` | 765 lines; HTML inlined in Python | Extract into a `templates/` directory |
| `autolabel.py` | 753 lines; batch-processing logic | Extract worker functions into a module |
| `format_variants.py` | 610 lines; variant generation | Consider a strategy pattern |

---

## 7. Recommended Actions

### Phase 1: Critical fixes (1 week)

1. **Configuration security** (1 hour)
   - Remove the plaintext password from config.py
   - Add environment-variable support
   - Update the README with configuration instructions

2. **Error-handling standardization** (1 day)
   - Define custom exception classes
   - Replace generic Exception catches
   - Add error-code constants

3. **Add critical integration tests** (2 days)
   - End-to-end inference tests
   - payment_line cross-validation tests
   - API endpoint tests

### Phase 2: Refactoring (2-3 weeks)

4. **Unify payment_line parsing** (2 days)
   - Create `src/common/payment_line_parser.py`
   - Merge the 3 duplicate implementations
   - Migrate all callers

5. **Split field_extractor.py** (3 days)
   - Create a `src/inference/normalizers/` submodule
   - One file per field type
   - Extract shared validation logic

6. **Split long functions** (2 days)
   - `_normalize_customer_number()` → 3 functions
   - `_cross_validate_payment_line()` → a CrossValidator class

### Phase 3: Improvements (1-2 weeks)

7. **Raise test coverage** (5 days)
   - Target: 70%+ coverage
   - Focus on validation logic
   - Add edge-case tests

8. **Configuration-management improvements** (1 day)
   - Extract all magic numbers
   - Create configuration files (YAML)
   - Add configuration validation

9. **Documentation improvements** (2 days)
   - Add architecture diagrams
   - Document all private methods
   - Create a contributing guide

---

## Appendix A: Metrics

### Code complexity

| Category | Count | Avg. lines |
|------|------|----------|
| Source files | 67 | 334 |
| Long files (>500 lines) | 12 | 875 |
| Long functions (>50 lines) | 23 | 89 |
| Test files | 13 | 298 |

### Dependencies

| Type | Count |
|------|------|
| External dependencies | ~25 |
| Internal modules | 10 |
| Circular dependencies | 0 ✅ |

### Code style

| Metric | Coverage |
|------|--------|
| Type hints | 80% |
| Docstrings (public) | 80% |
| Docstrings (private) | 40% |
| Test coverage | 45% |

---

**Generated**: 2026-01-22
**Reviewer**: Claude Code
**Version**: v2.0

@@ -1,96 +0,0 @@
# Field Extractor Analysis Report

## Overview

field_extractor.py (1,183 lines) was initially identified as a candidate for optimization, and a refactoring attempt was made using the `src/normalize` module. After analysis and testing, the conclusion is that it **should not be refactored**.

## Refactoring Attempt

### Initial plan
Delete the duplicated normalize methods in field_extractor.py and use the unified `src/normalize/normalize_field()` interface instead.

### Steps taken
1. ✅ Backed up the original file (`field_extractor_old.py`)
2. ✅ Changed `_normalize_and_validate` to use the unified normalizer
3. ✅ Deleted the duplicated normalize methods (~400 lines)
4. ❌ Ran the tests - **28 failures**
5. ✅ Added wrapper methods delegating to the normalizer
6. ❌ Ran the tests again - **12 failures**
7. ✅ Restored the original file
8. ✅ Tests pass - **all 45 tests pass**

## Key Findings

### The two modules serve different purposes

| Module | Purpose | Input | Output | Example |
|------|------|------|------|------|
| **src/normalize/** | **Variant generation** for matching | An already-extracted field value | A list of matching variants | `"INV-12345"` → `["INV-12345", "12345"]` |
| **field_extractor** | **Value extraction** from OCR text | Raw OCR text containing the field | A single extracted field value | `"Fakturanummer: A3861"` → `"A3861"` |

### Why can't they be unified?

1. **src/normalize/** is designed to:
   - Receive already-extracted field values
   - Generate multiple normalized variants for fuzzy matching
   - For example, BankgiroNormalizer:
   ```python
   normalize("782-1713") → ["7821713", "782-1713"]  # generates variants
   ```

2. **field_extractor**'s normalize methods:
   - Receive raw OCR text containing the field (possibly with labels and other text)
   - **Extract** the field value matching a specific pattern
   - For example `_normalize_bankgiro`:
   ```python
   _normalize_bankgiro("Bankgiro: 782-1713") → ("782-1713", True, None)  # extracts from text
   ```

3. **The key difference** (contrasted in the snippet below):
   - Normalizer: variant generator (for matching)
   - Field Extractor: pattern extractor (for parsing)
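The contrast in one runnable snippet — these are illustrative stand-ins for the two modules, not the real implementations:

```python
import re

def normalize_variants(value: str) -> list[str]:
    """Matching side: expand an already-extracted value into variants."""
    return [value.replace("-", ""), value]

def extract_bankgiro(ocr_text: str) -> str | None:
    """Extraction side: pull the formatted value out of raw OCR text first."""
    m = re.search(r'(\d{3,4}-\d{4})', ocr_text)
    return m.group(1) if m else None

print(normalize_variants("782-1713"))          # ['7821713', '782-1713']
print(extract_bankgiro("Bankgiro: 782-1713"))  # 782-1713
```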
### Test failure examples

Failures after replacing the field extractor methods with the normalizer:

```python
# InvoiceNumber test
Input:    "Fakturanummer: A3861"
Expected: "A3861"
Actual:   "Fakturanummer: A3861"  # not extracted, only cleaned

# Bankgiro test
Input:    "Bankgiro: 782-1713"
Expected: "782-1713"
Actual:   "7821713"  # returned the dash-less variant instead of the extracted formatted value
```

## Conclusion

**field_extractor.py should not be refactored onto the src/normalize module**, because:

1. ✅ **Different responsibilities**: extraction vs. variant generation
2. ✅ **Different inputs**: raw OCR text with labels vs. already-extracted field values
3. ✅ **Different outputs**: a single extracted value vs. multiple matching variants
4. ✅ **The existing code works well**: all 45 tests pass
5. ✅ **The extraction logic has value**: it contains complex pattern-matching rules (e.g. distinguishing Bankgiro/Plusgiro formats)

## Recommendations

1. **Keep field_extractor.py as-is**: do not refactor
2. **Document the difference between the two modules**: make sure the team understands their respective purposes
3. **Focus on other optimization targets**: machine_code_parser.py (919 lines)

## Lessons Learned

Before refactoring, one should:
1. Understand a module's **real purpose**, not just surface code similarity
2. Run the full test suite to validate assumptions
3. Assess whether there is genuine duplication, or superficial similarity with different purposes

---

**Status**: ✅ Analysis complete; decided not to refactor
**Tests**: ✅ 45/45 passing
**File**: kept at 1,183 lines, unchanged

@@ -1,238 +0,0 @@
# Machine Code Parser Analysis Report

## File Overview

- **File**: `src/ocr/machine_code_parser.py`
- **Total lines**: 919
- **Code lines**: 607 (66%)
- **Methods**: 14
- **Regex usages**: 47

## Code Structure

### Class structure

```
MachineCodeResult (dataclass)
├── to_dict()
└── get_region_bbox()

MachineCodeParser (main parser)
├── __init__()
├── parse() - main entry point
├── _find_tokens_with_values()
├── _find_machine_code_line_tokens()
├── _parse_standard_payment_line_with_tokens()
├── _parse_standard_payment_line() - 142 lines ⚠️
├── _extract_ocr() - 50 lines
├── _extract_bankgiro() - 58 lines
├── _extract_plusgiro() - 30 lines
├── _extract_amount() - 68 lines
├── _calculate_confidence()
└── cross_validate()
```

## Issues Found

### 1. ⚠️ `_parse_standard_payment_line` is too long (142 lines)

**Location**: lines 442-582

**Problems**:
- Contains the nested functions `normalize_account_spaces` and `format_account`
- Multiple regex-matching branches
- Complex logic that is hard to test and maintain

**Recommendation**:
Split into standalone methods:
- `_normalize_account_spaces(line)`
- `_format_account(account_digits, context)`
- `_match_primary_pattern(line)`
- `_match_fallback_patterns(line)`

### 2. 🔁 The four `_extract_*` methods share a repeated pattern

All extract methods follow the same pattern:

```python
def _extract_XXX(self, tokens):
    candidates = []

    for token in tokens:
        text = token.text.strip()
        matches = self.XXX_PATTERN.findall(text)
        for match in matches:
            # validation logic
            # context detection
            candidates.append((normalized, context_score, token))

    if not candidates:
        return None

    candidates.sort(key=lambda x: (x[1], 1), reverse=True)
    return candidates[0][0]
```

**Duplicated logic**:
- Token iteration
- Pattern matching
- Candidate collection
- Context scoring
- Sorting and best-match selection

**Recommendation**:
A base extractor class or a generic helper method could remove this duplication (see the sketch under Option B below).

### 3. ✅ Context detection is duplicated

The context-detection code is repeated in several places:

```python
# in _extract_bankgiro
context_text = ' '.join(t.text.lower() for t in tokens)
is_bankgiro_context = (
    'bankgiro' in context_text or
    'bg:' in context_text or
    'bg ' in context_text
)

# in _extract_plusgiro
context_text = ' '.join(t.text.lower() for t in tokens)
is_plusgiro_context = (
    'plusgiro' in context_text or
    'postgiro' in context_text or
    'pg:' in context_text or
    'pg ' in context_text
)

# in _parse_standard_payment_line
context = (context_line or raw_line).lower()
is_plusgiro_context = (
    ('plusgiro' in context or 'postgiro' in context or 'plusgirokonto' in context)
    and 'bankgiro' not in context
)
```

**Recommendation**:
Extract into a standalone method:
- `_detect_account_context(tokens) -> dict[str, bool]`

## Refactoring Options

### Option A: Light refactoring (recommended) ✅

**Goal**: extract the duplicated context-detection logic without changing the overall structure

**Steps**:
1. Extract a `_detect_account_context(tokens)` method
2. Extract `_normalize_account_spaces(line)` as a standalone method
3. Extract `_format_account(digits, context)` as a standalone method

**Impact**:
- Removes ~50-80 lines of duplicated code
- Improves testability
- Low risk, easy to verify

**Expected result**: 919 lines → ~850 lines (↓7%)

### Option B: Medium refactoring

**Goal**: create a generic field-extraction framework

**Steps** (a hedged sketch of the generic helper follows this list):
1. Create `_generic_extract(pattern, normalizer, context_checker)`
2. Refactor all `_extract_*` methods onto the generic framework
3. Split `_parse_standard_payment_line` into several small methods

**Impact**:
- Removes ~150-200 lines of code
- Significantly improves maintainability
- Medium risk; needs thorough testing

**Expected result**: 919 lines → ~720 lines (↓22%)
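A minimal sketch of the generic helper from step 1 — the signature, the shape of the token objects, and the scoring interface are assumptions for illustration, not the module's actual API:

```python
import re
from typing import Callable

def _generic_extract(
    tokens: list,                              # TextToken-like objects exposing .text
    pattern: re.Pattern,
    normalizer: Callable[[str], str | None],   # returns None to reject a candidate
    context_score: Callable[[list], int],      # scores the surrounding context
) -> str | None:
    """Shared candidate loop behind every _extract_* method (hypothetical)."""
    score = context_score(tokens)
    candidates = []
    for token in tokens:
        for match in pattern.findall(token.text.strip()):
            normalized = normalizer(match)
            if normalized is not None:
                candidates.append((normalized, score))
    if not candidates:
        return None
    candidates.sort(key=lambda c: c[1], reverse=True)  # best context score wins
    return candidates[0][0]
```

Each `_extract_*` method would then shrink to a pattern, a normalizer, and a context checker.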
### Option C: Deep refactoring (not recommended)

**Goal**: a complete redesign around the strategy pattern

**Risks**:
- High risk; could introduce bugs
- Requires extensive testing
- Could break existing integrations

## Recommended Option

### ✅ Adopt Option A (light refactoring)

**Rationale**:
1. **The code already works well**: no obvious bugs or performance problems
2. **Low risk**: only extracts duplicated logic without changing the core algorithm
3. **High value for the effort**: a small change brings a clear code-quality improvement
4. **Easy to verify**: the existing tests should cover it

### Refactoring steps

```python
# 1. Extract context detection
def _detect_account_context(self, tokens: list[TextToken]) -> dict[str, bool]:
    """Detect account-type keywords in the surrounding context."""
    context_text = ' '.join(t.text.lower() for t in tokens)

    return {
        'bankgiro': any(kw in context_text for kw in ['bankgiro', 'bg:', 'bg ']),
        'plusgiro': any(kw in context_text for kw in ['plusgiro', 'postgiro', 'plusgirokonto', 'pg:', 'pg ']),
    }

# 2. Extract space normalization
def _normalize_account_spaces(self, line: str) -> str:
    """Remove spaces inside the account number."""
    ...  # (existing code from lines 460-481)

# 3. Extract account formatting
def _format_account(
    self,
    account_digits: str,
    is_plusgiro_context: bool
) -> tuple[str, str]:
    """Format the account and determine its type."""
    ...  # (existing code from lines 485-523)
```

## Comparison: field_extractor vs machine_code_parser

| Aspect | field_extractor | machine_code_parser |
|------|-----------------|---------------------|
| Purpose | Value extraction | Machine-code parsing |
| Duplicated code | ~400 lines of normalize methods | ~80 lines of context detection |
| Refactoring value | ❌ Different purposes; should not be unified | ✅ Shared logic can be extracted |
| Risk | High (would break functionality) | Low (code organization only) |

## Decision

### ✅ Refactoring machine_code_parser.py is recommended

**How it differs from field_extractor**:
- field_extractor: the duplicated methods serve **different purposes** (extraction vs. variant generation)
- machine_code_parser: the duplicated code serves **the same purpose** (all of it is context detection)

**Expected benefits**:
- Removes ~70 lines of duplicated code
- Improves testability (context detection can be tested in isolation)
- Clearer code organization
- **Low risk**, easy to verify

## Next Steps

1. ✅ Back up the original file
2. ✅ Extract the `_detect_account_context` method
3. ✅ Extract the `_normalize_account_spaces` method
4. ✅ Extract the `_format_account` method
5. ✅ Update all call sites
6. ✅ Run the tests to verify
7. ✅ Check code coverage

---

**Status**: 📋 Analysis complete; light refactoring recommended
**Risk assessment**: 🟢 Low risk
**Expected benefit**: 919 lines → ~850 lines (↓7%)

@@ -1,519 +0,0 @@
# Performance Optimization Guide

This document provides performance optimization recommendations for the Invoice Field Extraction system.

## Table of Contents

1. [Batch Processing Optimization](#batch-processing-optimization)
2. [Database Query Optimization](#database-query-optimization)
3. [Caching Strategies](#caching-strategies)
4. [Memory Management](#memory-management)
5. [Profiling and Monitoring](#profiling-and-monitoring)

---

## Batch Processing Optimization

### Current State

The system processes invoices one at a time. For large batches, this can be inefficient.

### Recommendations

#### 1. Database Batch Operations

**Current**: Individual inserts for each document
```python
# Inefficient
for doc in documents:
    db.insert_document(doc)  # Individual DB call
```

**Optimized**: Use `execute_values` for batch inserts
```python
# Efficient - already implemented in db.py line 519
from psycopg2.extras import execute_values

execute_values(cursor, """
    INSERT INTO documents (...)
    VALUES %s
""", document_values)
```

**Impact**: 10-50x faster for batches of 100+ documents

#### 2. PDF Processing Batching

**Recommendation**: Process PDFs in parallel using multiprocessing

```python
from multiprocessing import Pool

def process_batch(pdf_paths, batch_size=10):
    """Process PDFs in parallel batches."""
    with Pool(processes=batch_size) as pool:
        results = pool.map(pipeline.process_pdf, pdf_paths)
    return results
```

**Considerations**:
- GPU models should use a shared process pool (already exists: `src/processing/gpu_pool.py`)
- CPU-intensive tasks can use a separate process pool (`src/processing/cpu_pool.py`)
- The current dual pool coordinator (`dual_pool_coordinator.py`) already supports this pattern

**Status**: ✅ Already implemented in `src/processing/` modules

#### 3. Image Caching for Multi-Page PDFs

**Current**: Each page rendered independently
```python
# Current pattern in field_extractor.py
for page_num in range(total_pages):
    image = render_pdf_page(pdf_path, page_num, dpi=300)
```

**Optimized**: Pre-render all pages if processing multiple fields per page
```python
# Batch render
images = {
    page_num: render_pdf_page(pdf_path, page_num, dpi=300)
    for page_num in page_numbers_needed
}

# Reuse images
for detection in detections:
    image = images[detection.page_no]
    extract_field(detection, image)
```

**Impact**: Reduces redundant PDF rendering by 50-90% for multi-field invoices

---

## Database Query Optimization

### Current Performance

- **Parameterized queries**: ✅ Implemented (Phase 1)
- **Connection pooling**: ❌ Not implemented
- **Query batching**: ✅ Partially implemented
- **Index optimization**: ⚠️ Needs verification

### Recommendations

#### 1. Connection Pooling

**Current**: New connection for each operation
```python
def connect(self):
    """Create new database connection."""
    return psycopg2.connect(**self.config)
```

**Optimized**: Use connection pooling
```python
from psycopg2 import pool

class DocumentDatabase:
    def __init__(self, config):
        self.pool = pool.SimpleConnectionPool(
            minconn=1,
            maxconn=10,
            **config
        )

    def connect(self):
        return self.pool.getconn()

    def close(self, conn):
        self.pool.putconn(conn)
```

**Impact**:
- Reduces connection overhead by 80-95%
- Especially important for high-frequency operations
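A usage sketch to pair with the pool above: a small context manager keeps `connect()`/`close()` calls paired even when a query raises (the `db` object is the `DocumentDatabase` from the snippet above; the helper name is an assumption):

```python
from contextlib import contextmanager

@contextmanager
def pooled_connection(db):
    """Borrow a connection from the pool and always return it, even on error."""
    conn = db.connect()
    try:
        yield conn
    finally:
        db.close(conn)

# Usage:
# with pooled_connection(db) as conn:
#     with conn.cursor() as cursor:
#         cursor.execute("SELECT 1")
```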
#### 2. Index Recommendations

**Check current indexes**:
```sql
-- Verify indexes exist on frequently queried columns
SELECT tablename, indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public';
```

**Recommended indexes**:
```sql
-- If not already present
CREATE INDEX IF NOT EXISTS idx_documents_success
    ON documents(success);

CREATE INDEX IF NOT EXISTS idx_documents_timestamp
    ON documents(timestamp DESC);

CREATE INDEX IF NOT EXISTS idx_field_results_document_id
    ON field_results(document_id);

CREATE INDEX IF NOT EXISTS idx_field_results_matched
    ON field_results(matched);

CREATE INDEX IF NOT EXISTS idx_field_results_field_name
    ON field_results(field_name);
```

**Impact**:
- 10-100x faster queries for filtered/sorted results
- Critical for `get_failed_matches()` and `get_all_documents_summary()`

#### 3. Query Batching

**Status**: ✅ Already implemented for field results (line 519)

**Verify batching is used**:
```python
# Good pattern in db.py
execute_values(cursor, "INSERT INTO field_results (...) VALUES %s", field_values)
```

**Additional opportunity**: Batch `SELECT` queries
```python
# Current
docs = [get_document(doc_id) for doc_id in doc_ids]  # N queries

# Optimized
docs = get_documents_batch(doc_ids)  # 1 query with IN clause
```

**Status**: ✅ Already implemented (`get_documents_batch` exists in db.py)

---

## Caching Strategies

### 1. Model Loading Cache

**Current**: Models loaded per-instance

**Recommendation**: Singleton pattern for the YOLO model
```python
class YOLODetectorSingleton:
    _instance = None
    _model = None

    @classmethod
    def get_instance(cls, model_path):
        if cls._instance is None:
            cls._instance = YOLODetector(model_path)
        return cls._instance
```

**Impact**: Reduces memory usage by 90% when processing multiple documents

### 2. Parser Instance Caching

**Current**: ✅ Already optimal
```python
# Good pattern in field_extractor.py
def __init__(self):
    self.payment_line_parser = PaymentLineParser()        # Reused
    self.customer_number_parser = CustomerNumberParser()  # Reused
```

**Status**: No changes needed

### 3. OCR Result Caching

**Recommendation**: Cache OCR results for identical regions
```python
import hashlib

_ocr_cache: dict = {}

def ocr_region_cached(image, bbox):
    """Cache OCR results by image hash + bbox (numpy images are unhashable, so key on a digest)."""
    key = (hashlib.md5(image.tobytes()).hexdigest(), tuple(bbox))
    if key not in _ocr_cache:
        _ocr_cache[key] = paddle_ocr.ocr_region(image, bbox)
    return _ocr_cache[key]
```

**Impact**: 50-80% speedup when re-processing similar documents

**Note**: The cache keys on `hashlib.md5(image.tobytes())`; a bounded cache with eviction (e.g. LRU) would be needed for long-running processes.

---
## Memory Management

### Current Issues

**Potential memory leaks**:
1. Large images kept in memory after processing
2. OCR results accumulated without cleanup
3. Model outputs not explicitly cleared

### Recommendations

#### 1. Explicit Image Cleanup

```python
import gc

def process_pdf(pdf_path):
    image = None
    try:
        image = render_pdf(pdf_path)
        result = extract_fields(image)
        return result
    finally:
        del image     # Explicit cleanup (image is pre-bound, so this is safe even if rendering fails)
        gc.collect()  # Force garbage collection
```

#### 2. Generator Pattern for Large Batches

**Current**: Load all documents into memory
```python
docs = [process_pdf(path) for path in pdf_paths]  # All in memory
```

**Optimized**: Use a generator for streaming processing
```python
def process_batch_streaming(pdf_paths):
    """Process documents one at a time, yielding results."""
    for path in pdf_paths:
        result = process_pdf(path)
        yield result
        # Result can be saved to DB immediately
        # Previous result is garbage collected
```

**Impact**: Constant memory usage regardless of batch size

#### 3. Context Managers for Resources

```python
class InferencePipeline:
    def __enter__(self):
        self.detector.load_model()
        return self

    def __exit__(self, *args):
        self.detector.unload_model()
        self.extractor.cleanup()

# Usage
with InferencePipeline(...) as pipeline:
    results = pipeline.process_pdf(path)
# Automatic cleanup
```

---

## Profiling and Monitoring

### Recommended Profiling Tools

#### 1. cProfile for CPU Profiling

```python
import cProfile
import pstats

profiler = cProfile.Profile()
profiler.enable()

# Your code here
pipeline.process_pdf(pdf_path)

profiler.disable()
stats = pstats.Stats(profiler)
stats.sort_stats('cumulative')
stats.print_stats(20)  # Top 20 slowest functions
```

#### 2. memory_profiler for Memory Analysis

```bash
pip install memory_profiler
python -m memory_profiler your_script.py
```

Or decorator-based:
```python
from memory_profiler import profile

@profile
def process_large_batch(pdf_paths):
    # Memory usage tracked line-by-line
    results = [process_pdf(path) for path in pdf_paths]
    return results
```

#### 3. py-spy for Production Profiling

```bash
pip install py-spy

# Profile running process
py-spy top --pid 12345

# Generate flamegraph
py-spy record -o profile.svg -- python your_script.py
```

**Advantage**: No code changes needed, minimal overhead

### Key Metrics to Monitor

1. **Processing Time per Document**
   - Target: <10 seconds for a single-page invoice
   - Current: ~2-5 seconds (estimated)

2. **Memory Usage**
   - Target: <2GB for a batch of 100 documents
   - Monitor: Peak memory usage

3. **Database Query Time**
   - Target: <100ms per query (with indexes)
   - Monitor: Slow query log

4. **OCR Accuracy vs Speed Trade-off**
   - Current: PaddleOCR with GPU (~200ms per region)
   - Alternative: Tesseract (~500ms, slightly more accurate)

### Logging Performance Metrics

**Add to pipeline.py**:
```python
import time
import logging

logger = logging.getLogger(__name__)

def process_pdf(self, pdf_path):
    start = time.time()

    # Processing...
    result = self._process_internal(pdf_path)

    elapsed = time.time() - start
    logger.info(f"Processed {pdf_path} in {elapsed:.2f}s")

    # Log to database for analysis
    self.db.log_performance({
        'document_id': result.document_id,
        'processing_time': elapsed,
        'field_count': len(result.fields)
    })

    return result
```

---

## Performance Optimization Priorities

### High Priority (Implement First)

1. ✅ **Database parameterized queries** - Already done (Phase 1)
2. ⚠️ **Database connection pooling** - Not implemented
3. ⚠️ **Index optimization** - Needs verification

### Medium Priority

4. ⚠️ **Batch PDF rendering** - Optimization possible
5. ✅ **Parser instance reuse** - Already done (Phase 2)
6. ⚠️ **Model caching** - Could improve

### Low Priority (Nice to Have)

7. ⚠️ **OCR result caching** - Complex implementation
8. ⚠️ **Generator patterns** - Refactoring needed
9. ⚠️ **Advanced profiling** - For production optimization

---

## Benchmarking Script

```python
"""
Benchmark script for invoice processing performance.
"""

import time
from pathlib import Path
from src.inference.pipeline import InferencePipeline

def benchmark_single_document(pdf_path, iterations=10):
    """Benchmark single document processing."""
    pipeline = InferencePipeline(
        model_path="path/to/model.pt",
        use_gpu=True
    )

    times = []
    for i in range(iterations):
        start = time.time()
        result = pipeline.process_pdf(pdf_path)
        elapsed = time.time() - start
        times.append(elapsed)
        print(f"Iteration {i+1}: {elapsed:.2f}s")

    avg_time = sum(times) / len(times)
    print(f"\nAverage: {avg_time:.2f}s")
    print(f"Min: {min(times):.2f}s")
    print(f"Max: {max(times):.2f}s")

def benchmark_batch(pdf_paths, batch_size=10):
    """Benchmark batch processing."""
    from multiprocessing import Pool

    pipeline = InferencePipeline(
        model_path="path/to/model.pt",
        use_gpu=True
    )

    start = time.time()

    with Pool(processes=batch_size) as pool:
        results = pool.map(pipeline.process_pdf, pdf_paths)

    elapsed = time.time() - start
    avg_per_doc = elapsed / len(pdf_paths)

    print(f"Total time: {elapsed:.2f}s")
    print(f"Documents: {len(pdf_paths)}")
    print(f"Average per document: {avg_per_doc:.2f}s")
    print(f"Throughput: {len(pdf_paths)/elapsed:.2f} docs/sec")

if __name__ == "__main__":
    # Single document benchmark
    benchmark_single_document("test.pdf")

    # Batch benchmark
    pdf_paths = list(Path("data/test_pdfs").glob("*.pdf"))
    benchmark_batch(pdf_paths[:100])
```

---

## Summary

**Implemented (Phase 1-2)**:
- ✅ Parameterized queries (SQL injection fix)
- ✅ Parser instance reuse (Phase 2 refactoring)
- ✅ Batch insert operations (execute_values)
- ✅ Dual pool processing (CPU/GPU separation)

**Quick Wins (Low effort, high impact)**:
- Database connection pooling (2-4 hours)
- Index verification and optimization (1-2 hours)
- Batch PDF rendering (4-6 hours)

**Long-term Improvements**:
- OCR result caching with hashing
- Generator patterns for streaming
- Advanced profiling and monitoring

**Expected Impact**:
- Connection pooling: 80-95% reduction in DB overhead
- Indexes: 10-100x faster queries
- Batch rendering: 50-90% less redundant work
- **Overall**: 2-5x throughput improvement for batch processing
File diff suppressed because it is too large
@@ -1,170 +0,0 @@
# Refactoring Summary Report

## 📊 Overall Results

### Test status
- ✅ **688/688 tests passing** (100%)
- ✅ **Code coverage**: 34% → 37% (+3%)
- ✅ **0 failures**, 0 errors

### Coverage improvements
- ✅ **machine_code_parser**: 25% → 65% (+40%)
- ✅ **New tests**: 55 (633 → 688)

---

## 🎯 Completed Refactorings

### 1. ✅ Matcher modularization (876 lines → 205 lines, ↓76%)

**Files**:

**What was done**:
- Split the single 876-line file into **11 modules**
- Extracted **5 independent matching strategies**
- Created dedicated modules for data models, utility functions, and context handling

**New module structure**:

**Test results**:
- ✅ All 77 matcher tests passing
- ✅ Complete README documentation
- ✅ Strategy pattern, easy to extend

**Benefits** (a sketch of the strategy shape follows this list):
- 📉 76% less code
- 📈 Significantly better maintainability
- ✨ Each strategy tested in isolation
- 🔧 Easy to add new strategies
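A hedged sketch of the strategy shape implied above — the names and interface are assumptions for illustration, not the actual matcher API:

```python
from typing import Protocol

class MatchStrategy(Protocol):
    def match(self, extracted: str, expected: str) -> float | None:
        """Return a score in [0, 1], or None when the strategy does not apply."""
        ...

class ExactMatch:
    def match(self, extracted: str, expected: str) -> float | None:
        return 1.0 if extracted == expected else None

def best_match(strategies: list[MatchStrategy], extracted: str, expected: str) -> float:
    # Adding a new strategy means appending to the list; no existing code changes.
    for strategy in strategies:
        score = strategy.match(extracted, expected)
        if score is not None:
            return score
    return 0.0
```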
---

### 2. ✅ Machine Code Parser light refactoring + test coverage (919 lines → 929 lines)

**File**: src/ocr/machine_code_parser.py

**What was done**:
- Extracted **3 shared helper methods**, eliminating duplicated code
- Streamlined the context-detection logic
- Simplified the account-formatting method

**Test improvements**:
- ✅ **55 new tests** (24 → 79)
- ✅ **Coverage**: 25% → 65% (+40%)
- ✅ All 688 project tests passing

**New test coverage**:
- **Round one** (22 tests):
  - `_detect_account_context()` - 8 tests (context detection)
  - `_normalize_account_spaces()` - 5 tests (space normalization)
  - `_format_account()` - 4 tests (account formatting)
  - `parse()` - 5 tests (main entry point)
- **Round two** (33 tests):
  - `_extract_ocr()` - 8 tests (OCR extraction)
  - `_extract_bankgiro()` - 9 tests (Bankgiro extraction)
  - `_extract_plusgiro()` - 8 tests (Plusgiro extraction)
  - `_extract_amount()` - 8 tests (amount extraction)

**Benefits**:
- 🔄 Eliminated 80 lines of duplicated code
- 📈 Better testability (helper methods can be tested in isolation)
- 📖 More readable code
- ✅ Coverage up from 25% to 65% (+40%)
- 🎯 Low risk, high reward

---

### 3. ✅ Field Extractor analysis (decided not to refactor)

**File**: (1,183 lines)

**Conclusion**: ❌ **should not be refactored**

**Key insight**:
- Superficially similar code can serve **completely different purposes**
- field_extractor: **parses/extracts** field values
- src/normalize: **normalizes/generates variants** for matching
- The responsibilities differ; they should not be unified

**Documentation**:

---

## 📈 Refactoring Statistics

### Line-count changes

| File | Before | After | Change | Percent |
|------|--------|--------|------|--------|
| **matcher/field_matcher.py** | 876 | 205 | -671 | ↓76% |
| **matcher/* (10 new modules)** | 0 | 466 | +466 | new |
| **matcher total** | 876 | 671 | -205 | ↓23% |
| **ocr/machine_code_parser.py** | 919 | 929 | +10 | +1% |
| **Net reduction** | - | - | **-195 lines** | **↓11%** |

### Test coverage

| Module | Tests | Pass rate | Coverage | Status |
|------|--------|--------|--------|------|
| matcher | 77 | 100% | - | ✅ |
| field_extractor | 45 | 100% | 39% | ✅ |
| machine_code_parser | 79 | 100% | 65% | ✅ |
| normalizer | ~120 | 100% | - | ✅ |
| other modules | ~367 | 100% | - | ✅ |
| **Total** | **688** | **100%** | **37%** | ✅ |

---

## 🎓 Lessons from the Refactoring

### What worked

1. **✅ Test before refactoring**
   - Every refactoring had full test coverage
   - Tests were run immediately after each change
   - A 100% pass rate guaranteed quality

2. **✅ Identify genuine duplication**
   - Not all similar code is duplication
   - field_extractor vs normalizer: superficially similar but different purposes
   - machine_code_parser: genuine code duplication

3. **✅ Incremental refactoring**
   - matcher: large-scale modularization (strategy pattern)
   - machine_code_parser: light refactoring (extract shared methods)
   - field_extractor: analyzed, then decided not to refactor

### Key decisions

#### ✅ When refactoring was warranted
- **matcher**: a single file too long (876 lines), containing multiple strategies
- **machine_code_parser**: duplicated code with the same purpose in several places

#### ❌ When it was not
- **field_extractor**: similar code with different purposes

### Takeaway

**Don't chase the DRY principle blindly**
> Similar code is not necessarily duplicated code. Understand the code's **real purpose**.

---

## ✅ Summary

**Key results**:
- 📉 Net reduction of 195 lines of code
- 📈 Code coverage +3% (34% → 37%)
- ✅ Test count +55 (633 → 688)
- 🎯 machine_code_parser coverage +40% (25% → 65%)
- ✨ Substantially improved modularity
- 🎯 Much better maintainability

**Key lesson**:
> Similar code is not necessarily duplicated code. Only by understanding the code's real purpose can one make sound refactoring decisions.

**Next steps**:
1. Continue raising machine_code_parser coverage toward 80%+ (currently 65%)
2. Add tests for other low-coverage modules (field_extractor 39%, pipeline 19%)
3. Flesh out edge-case and error-path tests
@@ -1,258 +0,0 @@
# Test Coverage Improvement Report

## 📊 Improvement Overview

### Overall statistics
- ✅ **Total tests**: 633 → 688 (+55 tests, +8.7%)
- ✅ **Pass rate**: 100% (688/688)
- ✅ **Overall coverage**: 34% → 37% (+3%)

### machine_code_parser.py focus
- ✅ **Tests**: 24 → 79 (+55 tests, +229%)
- ✅ **Coverage**: 25% → 65% (+40%)
- ✅ **Uncovered lines**: 273 → 129 (144 fewer)

---

## 🎯 New Test Details

### Round one (22 tests)

#### 1. TestDetectAccountContext (8 tests)

Tests the new `_detect_account_context()` helper method.

**Test cases**:
1. `test_bankgiro_keyword` - detects the 'bankgiro' keyword
2. `test_bg_keyword` - detects the 'bg:' abbreviation
3. `test_plusgiro_keyword` - detects the 'plusgiro' keyword
4. `test_postgiro_keyword` - detects the 'postgiro' alias
5. `test_pg_keyword` - detects the 'pg:' abbreviation
6. `test_both_contexts` - both kinds of keywords present
7. `test_no_context` - no account keywords
8. `test_case_insensitive` - case-insensitive detection

**Code path covered**:
```python
def _detect_account_context(self, tokens: list[TextToken]) -> dict[str, bool]:
    context_text = ' '.join(t.text.lower() for t in tokens)
    return {
        'bankgiro': any(kw in context_text for kw in ['bankgiro', 'bg:', 'bg ']),
        'plusgiro': any(kw in context_text for kw in ['plusgiro', 'postgiro', 'plusgirokonto', 'pg:', 'pg ']),
    }
```
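A hedged sketch of what one of the listed cases could look like — the token construction is assumed, not copied from the actual suite:

```python
def test_bankgiro_keyword():
    parser = MachineCodeParser()
    # make_token is a hypothetical test helper producing a TextToken
    tokens = [make_token("Bankgiro:"), make_token("782-1713")]
    context = parser._detect_account_context(tokens)
    assert context['bankgiro'] is True
    assert context['plusgiro'] is False
```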
---

#### 2. TestNormalizeAccountSpacesMethod (5 tests)

Tests the new `_normalize_account_spaces()` helper method.

**Test cases**:
1. `test_removes_spaces_after_arrow` - removes spaces after >
2. `test_multiple_consecutive_spaces` - handles multiple consecutive spaces
3. `test_no_arrow_returns_unchanged` - returns the input unchanged when there is no > marker
4. `test_spaces_before_arrow_preserved` - preserves spaces before >
5. `test_empty_string` - empty-string handling

**Code path covered**:
```python
def _normalize_account_spaces(self, line: str) -> str:
    if '>' not in line:
        return line
    parts = line.split('>', 1)
    after_arrow = parts[1]
    normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', after_arrow)
    while re.search(r'(\d)\s+(\d)', normalized):
        normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', normalized)
    return parts[0] + '>' + normalized
```

---

#### 3. TestFormatAccount (4 tests)

Tests the new `_format_account()` helper method.

**Test cases**:
1. `test_plusgiro_context_forces_plusgiro` - a Plusgiro context forces Plusgiro formatting
2. `test_valid_bankgiro_7_digits` - formats a valid 7-digit Bankgiro
3. `test_valid_bankgiro_8_digits` - formats a valid 8-digit Bankgiro
4. `test_defaults_to_bankgiro_when_ambiguous` - defaults to Bankgiro in ambiguous cases

**Code path covered**:
```python
def _format_account(self, account_digits: str, is_plusgiro_context: bool) -> tuple[str, str]:
    if is_plusgiro_context:
        formatted = f"{account_digits[:-1]}-{account_digits[-1]}"
        return formatted, 'plusgiro'

    # Luhn validation logic
    pg_valid = FieldValidators.is_valid_plusgiro(account_digits)
    bg_valid = FieldValidators.is_valid_bankgiro(account_digits)

    # Decision logic (pg_formatted/bg_formatted construction elided in this excerpt)
    if pg_valid and not bg_valid:
        return pg_formatted, 'plusgiro'
    elif bg_valid and not pg_valid:
        return bg_formatted, 'bankgiro'
    else:
        return bg_formatted, 'bankgiro'
```

---

#### 4. TestParseMethod (5 tests)

Tests the main `parse()` entry point.

**Test cases**:
1. `test_parse_empty_tokens` - empty token list handling
2. `test_parse_finds_payment_line_in_bottom_region` - finds the payment line in the bottom 35% of the page
3. `test_parse_ignores_top_region` - ignores the top region of the page
4. `test_parse_with_context_keywords` - detects context keywords
5. `test_parse_stores_source_tokens` - stores the source tokens

**Code paths covered**:
- Token filtering (bottom-region detection)
- Context keyword detection
- Payment-line lookup and parsing
- Result-object construction

---

### Round two (33 tests)

#### 5. TestExtractOCR (8 tests)

Tests the `_extract_ocr()` method - OCR reference-number extraction.

**Test cases**:
1. `test_extract_valid_ocr_10_digits` - extracts a 10-digit OCR number
2. `test_extract_valid_ocr_15_digits` - extracts a 15-digit OCR number
3. `test_extract_ocr_with_hash_markers` - OCR wrapped in # markers
4. `test_extract_longest_ocr_when_multiple` - picks the longest among several candidates
5. `test_extract_ocr_ignores_short_numbers` - ignores numbers shorter than 10 digits
6. `test_extract_ocr_ignores_long_numbers` - ignores numbers longer than 25 digits
7. `test_extract_ocr_excludes_bankgiro_variants` - excludes Bankgiro variants
8. `test_extract_ocr_empty_tokens` - empty token handling

#### 6. TestExtractBankgiro (9 tests)

Tests the `_extract_bankgiro()` method - Bankgiro account extraction.

**Test cases**:
1. `test_extract_bankgiro_7_digits_with_dash` - 7-digit Bankgiro with a dash
2. `test_extract_bankgiro_7_digits_without_dash` - 7-digit Bankgiro without a dash
3. `test_extract_bankgiro_8_digits_with_dash` - 8-digit Bankgiro with a dash
4. `test_extract_bankgiro_8_digits_without_dash` - 8-digit Bankgiro without a dash
5. `test_extract_bankgiro_with_spaces` - Bankgiro containing spaces
6. `test_extract_bankgiro_handles_plusgiro_format` - handles Plusgiro-formatted input
7. `test_extract_bankgiro_with_context` - with context keywords
8. `test_extract_bankgiro_ignores_plusgiro_context` - ignores a Plusgiro context
9. `test_extract_bankgiro_empty_tokens` - empty token handling

#### 7. TestExtractPlusgiro (8 tests)

Tests the `_extract_plusgiro()` method - Plusgiro account extraction.

**Test cases**:
1. `test_extract_plusgiro_7_digits_with_dash` - 7-digit Plusgiro with a dash
|
|
||||||
2. `test_extract_plusgiro_7_digits_without_dash` - 无破折号的7位 Plusgiro
|
|
||||||
3. `test_extract_plusgiro_8_digits` - 8位 Plusgiro
|
|
||||||
4. `test_extract_plusgiro_with_spaces` - 带空格的 Plusgiro
|
|
||||||
5. `test_extract_plusgiro_with_context` - 带上下文关键词
|
|
||||||
6. `test_extract_plusgiro_ignores_too_short` - 忽略少于7位
|
|
||||||
7. `test_extract_plusgiro_ignores_too_long` - 忽略多于8位
|
|
||||||
8. `test_extract_plusgiro_empty_tokens` - 空 token 处理
|
|
||||||
|
|
||||||
#### 8. TestExtractAmount (8个测试)
|
|
||||||
|
|
||||||
测试 `_extract_amount()` 方法 - 金额提取。
|
|
||||||
|
|
||||||
**测试用例**:
|
|
||||||
1. `test_extract_amount_with_comma_decimal` - 逗号小数分隔符
|
|
||||||
2. `test_extract_amount_with_dot_decimal` - 点号小数分隔符
|
|
||||||
3. `test_extract_amount_integer` - 整数金额
|
|
||||||
4. `test_extract_amount_with_thousand_separator` - 千位分隔符
|
|
||||||
5. `test_extract_amount_large_number` - 大额金额
|
|
||||||
6. `test_extract_amount_ignores_too_large` - 忽略过大金额
|
|
||||||
7. `test_extract_amount_ignores_zero` - 忽略零或负数
|
|
||||||
8. `test_extract_amount_empty_tokens` - 空 token 处理
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📈 覆盖率分析
|
|
||||||
|
|
||||||
### 已覆盖的方法
|
|
||||||
✅ `_detect_account_context()` - **100%** (第一轮新增)
|
|
||||||
✅ `_normalize_account_spaces()` - **100%** (第一轮新增)
|
|
||||||
✅ `_format_account()` - **95%** (第一轮新增)
|
|
||||||
✅ `parse()` - **70%** (第一轮改进)
|
|
||||||
✅ `_parse_standard_payment_line()` - **95%** (已有测试)
|
|
||||||
✅ `_extract_ocr()` - **85%** (第二轮新增)
|
|
||||||
✅ `_extract_bankgiro()` - **90%** (第二轮新增)
|
|
||||||
✅ `_extract_plusgiro()` - **90%** (第二轮新增)
|
|
||||||
✅ `_extract_amount()` - **80%** (第二轮新增)
|
|
||||||
|
|
||||||
### 仍需改进的方法 (未覆盖/部分覆盖)
|
|
||||||
⚠️ `_calculate_confidence()` - **0%** (未测试)
|
|
||||||
⚠️ `cross_validate()` - **0%** (未测试)
|
|
||||||
⚠️ `get_region_bbox()` - **0%** (未测试)
|
|
||||||
⚠️ `_find_tokens_with_values()` - **部分覆盖**
|
|
||||||
⚠️ `_find_machine_code_line_tokens()` - **部分覆盖**
|
|
||||||
|
|
||||||
### 未覆盖的代码行(129行)
|
|
||||||
主要集中在:
|
|
||||||
1. **验证方法** (lines 805-824): `_calculate_confidence`, `cross_validate`
|
|
||||||
2. **辅助方法** (lines 80-92, 336-369, 377-407): Token 查找、bbox 计算、日志记录
|
|
||||||
3. **边界条件** (lines 648-653, 690, 699, 759-760等): 某些提取方法的边界情况
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 改进建议
|
|
||||||
|
|
||||||
### ✅ 已完成目标
|
|
||||||
- ✅ 覆盖率从 25% 提升到 65% (+40%)
|
|
||||||
- ✅ 测试数量从 24 增加到 79 (+55个)
|
|
||||||
- ✅ 提取方法全部测试(_extract_ocr, _extract_bankgiro, _extract_plusgiro, _extract_amount)
|
|
||||||
|
|
||||||
### 下一步目标(覆盖率 65% → 80%+)
|
|
||||||
1. **添加验证方法测试** - 为 `_calculate_confidence`, `cross_validate` 添加测试
|
|
||||||
2. **添加辅助方法测试** - 为 token 查找和 bbox 计算方法添加测试
|
|
||||||
3. **完善边界条件** - 增加边界情况和异常处理的测试
|
|
||||||
4. **集成测试** - 添加端到端的集成测试,使用真实 PDF token 数据
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ 已完成的改进
|
|
||||||
|
|
||||||
### 重构收益
|
|
||||||
- ✅ 提取的3个辅助方法现在可以独立测试
|
|
||||||
- ✅ 测试粒度更细,更容易定位问题
|
|
||||||
- ✅ 代码可读性提高,测试用例清晰易懂
|
|
||||||
|
|
||||||
### 质量保证
|
|
||||||
- ✅ 所有655个测试100%通过
|
|
||||||
- ✅ 无回归问题
|
|
||||||
- ✅ 新增测试覆盖了之前未测试的重构代码
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📚 测试编写经验
|
|
||||||
|
|
||||||
### 成功经验
|
|
||||||
1. **使用 fixture 创建测试数据** - `_create_token()` 辅助方法简化了 token 创建
|
|
||||||
2. **按方法组织测试类** - 每个方法一个测试类,结构清晰
|
|
||||||
3. **测试用例命名清晰** - `test_<what>_<condition>` 格式,一目了然
|
|
||||||
4. **覆盖关键路径** - 优先测试常见场景和边界条件
|
|
||||||
|
|
||||||
### 遇到的问题
|
|
||||||
1. **Token 初始化参数** - 忘记了 `page_no` 参数,导致初始测试失败
|
|
||||||
- 解决:修复 `_create_token()` 辅助方法,添加 `page_no=0`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**报告日期**: 2026-01-24
|
|
||||||
**状态**: ✅ 完成
|
|
||||||
**下一步**: 继续提升覆盖率到 60%+
|
|
||||||
@@ -1,619 +0,0 @@
# Multi-Pool Processing Architecture Design

## 1. Research Summary

### 1.1 Analysis of Current Problems

Our earlier dual-pool implementation had stability problems, chiefly:

| Problem | Cause | Fix |
|------|------|----------|
| Processing hangs | Mixing threads with ProcessPoolExecutor caused deadlocks | Use asyncio or a pure Queue pattern |
| Queue.get() blocks forever | No timeout mechanism | Add timeouts and sentinel values |
| GPU memory conflicts | Multiple processes hitting the GPU at once | Limit GPU workers to 1 |
| CUDA fork problem | Linux's default fork start method is incompatible with CUDA | Use the spawn start method |

### 1.2 Recommended Architecture

After this research, the best fit for our scenario is the **producer-consumer queue pattern**:

```
┌─────────────────┐     ┌─────────────────┐     ┌─────────────────┐
│  Main Process   │     │  CPU Workers    │     │  GPU Worker     │
│                 │     │  (4 processes)  │     │  (1 process)    │
│  ┌───────────┐  │     │                 │     │                 │
│  │ Task      │──┼────▶│  Text PDF work  │     │ Scanned PDF work│
│  │ Dispatcher│  │     │  (no OCR)       │     │  (PaddleOCR)    │
│  └───────────┘  │     │                 │     │                 │
│        ▲        │     │        │        │     │        │        │
│        │        │     │        ▼        │     │        ▼        │
│  ┌───────────┐  │     │  Result Queue   │     │  Result Queue   │
│  │ Result    │◀─┼─────│◀────────────────│─────│◀────────────────│
│  │ Collector │  │     │                 │     │                 │
│  └───────────┘  │     └─────────────────┘     └─────────────────┘
│        │        │
│        ▼        │
│  ┌───────────┐  │
│  │ Database  │  │
│  │ Batch     │  │
│  │ Writer    │  │
│  └───────────┘  │
└─────────────────┘
```
---

## 2. Core Design Principles

### 2.1 CUDA Compatibility

```python
# Key point: use the spawn start method
import multiprocessing as mp
import os

ctx = mp.get_context("spawn")

# Pin the device when the GPU worker initializes
def init_gpu_worker(gpu_id: int = 0):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    global _ocr
    from paddleocr import PaddleOCR
    _ocr = PaddleOCR(use_gpu=True, ...)
```

### 2.2 Worker Initialization Pattern

Use the `initializer` parameter to load the model once per worker instead of reloading it for every task:

```python
import multiprocessing as mp
import os
from concurrent.futures import ProcessPoolExecutor

# Module-global that holds the model inside each worker
_ocr = None

def init_worker(use_gpu: bool, gpu_id: int = 0):
    global _ocr
    if use_gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    from paddleocr import PaddleOCR
    _ocr = PaddleOCR(use_gpu=use_gpu, ...)

# Pass the initializer when creating the pool
pool = ProcessPoolExecutor(
    max_workers=1,
    initializer=init_worker,
    initargs=(True, 0),  # use_gpu=True, gpu_id=0
    mp_context=mp.get_context("spawn")
)
```

### 2.3 Queue Pattern vs as_completed

| Approach | Pros | Cons | Suitable for |
|------|------|------|----------|
| `as_completed()` | Simple; no queue management | All futures must be collected up front; no streaming hand-off between stages | Single-batch fan-out (works across pools) |
| `multiprocessing.Queue` | High performance, flexible | Manual management; deadlock risk | Multi-pool pipelines |
| `Manager().Queue()` | Picklable, shareable across pools | Lower performance | Scenarios that need Pool.map |

**Recommendation**: for the dual-pool scenario, submit to both pools and merge the resulting futures with a single `as_completed()` pass.
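
A minimal, self-contained sketch of that recommendation; the task functions and task lists here are placeholders, not project code:

```python
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing as mp

def cpu_fn(task: str) -> str:  # placeholder CPU-side work
    return f"cpu:{task}"

def gpu_fn(task: str) -> str:  # placeholder GPU-side work
    return f"gpu:{task}"

def main() -> None:
    cpu_tasks, gpu_tasks = ["a", "b"], ["c"]
    ctx = mp.get_context("spawn")
    with ProcessPoolExecutor(max_workers=4, mp_context=ctx) as cpu_pool, \
         ProcessPoolExecutor(max_workers=1, mp_context=ctx) as gpu_pool:
        futures = {cpu_pool.submit(cpu_fn, t): t for t in cpu_tasks}
        futures.update({gpu_pool.submit(gpu_fn, t): t for t in gpu_tasks})
        # as_completed() takes plain Future objects, so futures from two
        # different executors can be drained in one loop.
        for future in as_completed(futures):
            print(futures[future], future.result(timeout=300))

if __name__ == "__main__":  # required under the spawn start method
    main()
```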
---

## 3. Detailed Development Plan

### Phase 1: Refactor the Foundations (2-3 days)

#### 1.1 Create the WorkerPool Abstract Class

```python
# src/processing/worker_pool.py

from __future__ import annotations
from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, Future
from dataclasses import dataclass
from typing import Any, Callable, Optional
import multiprocessing as mp

@dataclass
class TaskResult:
    """Container for one task's result."""
    task_id: str
    success: bool
    data: Any
    error: Optional[str] = None
    processing_time: float = 0.0

class WorkerPool(ABC):
    """Abstract base class for worker pools."""

    def __init__(self, max_workers: int, use_gpu: bool = False, gpu_id: int = 0):
        self.max_workers = max_workers
        self.use_gpu = use_gpu
        self.gpu_id = gpu_id
        self._executor: Optional[ProcessPoolExecutor] = None

    @abstractmethod
    def get_initializer(self) -> Callable:
        """Return the worker initializer function."""
        pass

    @abstractmethod
    def get_init_args(self) -> tuple:
        """Return the initializer arguments."""
        pass

    def start(self):
        """Start the worker pool."""
        ctx = mp.get_context("spawn")
        self._executor = ProcessPoolExecutor(
            max_workers=self.max_workers,
            mp_context=ctx,
            initializer=self.get_initializer(),
            initargs=self.get_init_args()
        )

    def submit(self, fn: Callable, *args, **kwargs) -> Future:
        """Submit a task."""
        if not self._executor:
            raise RuntimeError("Pool not started")
        return self._executor.submit(fn, *args, **kwargs)

    def shutdown(self, wait: bool = True):
        """Shut the pool down."""
        if self._executor:
            self._executor.shutdown(wait=wait)
            self._executor = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.shutdown()
```
#### 1.2 Implement the CPU and GPU Worker Pools

```python
# src/processing/cpu_pool.py
# (init_cpu_worker / init_gpu_worker are the initializer functions from §2.)

class CPUWorkerPool(WorkerPool):
    """CPU-only worker pool for text PDF processing"""

    def __init__(self, max_workers: int = 4):
        super().__init__(max_workers=max_workers, use_gpu=False)

    def get_initializer(self) -> Callable:
        return init_cpu_worker

    def get_init_args(self) -> tuple:
        return ()


# src/processing/gpu_pool.py

class GPUWorkerPool(WorkerPool):
    """GPU worker pool for OCR processing"""

    def __init__(self, max_workers: int = 1, gpu_id: int = 0):
        super().__init__(max_workers=max_workers, use_gpu=True, gpu_id=gpu_id)

    def get_initializer(self) -> Callable:
        return init_gpu_worker

    def get_init_args(self) -> tuple:
        return (self.gpu_id,)
```
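
Usage then follows the context-manager protocol defined on `WorkerPool`. A sketch; the task function is a placeholder and must live at module level so the spawn start method can pickle it:

```python
def count_pages(pdf_path: str) -> int:  # placeholder; module-level for pickling
    return 1

with CPUWorkerPool(max_workers=4) as pool:
    future = pool.submit(count_pages, "invoice.pdf")
    print(future.result(timeout=60))
```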
---

### Phase 2: Implement the Dual-Pool Coordinator (2-3 days)

#### 2.1 Task Dispatcher

```python
# src/processing/task_dispatcher.py

from dataclasses import dataclass
from enum import Enum, auto
from typing import Any, List, Tuple

class TaskType(Enum):
    CPU = auto()  # Text PDF
    GPU = auto()  # Scanned PDF

@dataclass
class Task:
    id: str
    task_type: TaskType
    data: Any

class TaskDispatcher:
    """Dispatches tasks to the appropriate pool based on PDF type."""

    def classify_task(self, doc_info: dict) -> TaskType:
        """Decide whether the document needs OCR."""
        # Classify from PDF characteristics
        if self._is_scanned_pdf(doc_info):
            return TaskType.GPU
        return TaskType.CPU

    def _is_scanned_pdf(self, doc_info: dict) -> bool:
        """Detect whether this is a scanned document."""
        # 1. Check for extractable text
        # 2. Check the image-area ratio
        # 3. Check text density
        pass

    def partition_tasks(self, tasks: List[Task]) -> Tuple[List[Task], List[Task]]:
        """Split tasks into CPU and GPU groups."""
        cpu_tasks = [t for t in tasks if t.task_type == TaskType.CPU]
        gpu_tasks = [t for t in tasks if t.task_type == TaskType.GPU]
        return cpu_tasks, gpu_tasks
```
#### 2.2 Dual-Pool Coordinator

```python
# src/processing/dual_pool_coordinator.py

from concurrent.futures import as_completed
from typing import Callable, List, Optional
import logging

# Sibling modules defined earlier in this plan:
from src.processing.cpu_pool import CPUWorkerPool
from src.processing.gpu_pool import GPUWorkerPool
from src.processing.task_dispatcher import Task, TaskDispatcher
from src.processing.worker_pool import TaskResult

logger = logging.getLogger(__name__)

class DualPoolCoordinator:
    """Coordinates the CPU and GPU worker pools."""

    def __init__(
        self,
        cpu_workers: int = 4,
        gpu_workers: int = 1,
        gpu_id: int = 0
    ):
        self.cpu_pool = CPUWorkerPool(max_workers=cpu_workers)
        self.gpu_pool = GPUWorkerPool(max_workers=gpu_workers, gpu_id=gpu_id)
        self.dispatcher = TaskDispatcher()

    def __enter__(self):
        self.cpu_pool.start()
        self.gpu_pool.start()
        return self

    def __exit__(self, *args):
        self.cpu_pool.shutdown()
        self.gpu_pool.shutdown()

    def process_batch(
        self,
        documents: List[dict],
        cpu_task_fn: Callable,
        gpu_task_fn: Callable,
        on_result: Optional[Callable[[TaskResult], None]] = None,
        on_error: Optional[Callable[[str, Exception], None]] = None
    ) -> List[TaskResult]:
        """
        Process a batch of documents, dispatching each to the CPU or GPU pool.

        Args:
            documents: documents to process
            cpu_task_fn: task function for CPU documents
            gpu_task_fn: task function for GPU documents
            on_result: optional success callback
            on_error: optional error callback

        Returns:
            A list with one TaskResult per task.
        """
        # Classify tasks
        tasks = [
            Task(id=doc['id'], task_type=self.dispatcher.classify_task(doc), data=doc)
            for doc in documents
        ]
        cpu_tasks, gpu_tasks = self.dispatcher.partition_tasks(tasks)

        logger.info(f"Task partition: {len(cpu_tasks)} CPU, {len(gpu_tasks)} GPU")

        # Submit tasks to their respective pools
        cpu_futures = {
            self.cpu_pool.submit(cpu_task_fn, t.data): t.id
            for t in cpu_tasks
        }
        gpu_futures = {
            self.gpu_pool.submit(gpu_task_fn, t.data): t.id
            for t in gpu_tasks
        }

        # Collect results
        results = []
        all_futures = list(cpu_futures.keys()) + list(gpu_futures.keys())

        for future in as_completed(all_futures):
            task_id = cpu_futures.get(future) or gpu_futures.get(future)
            pool_type = "CPU" if future in cpu_futures else "GPU"

            try:
                data = future.result(timeout=300)  # 5-minute per-task timeout
                result = TaskResult(task_id=task_id, success=True, data=data)
                if on_result:
                    on_result(result)
            except Exception as e:
                logger.error(f"[{pool_type}] Task {task_id} failed: {e}")
                result = TaskResult(task_id=task_id, success=False, data=None, error=str(e))
                if on_error:
                    on_error(task_id, e)

            results.append(result)

        return results
```
---

### Phase 3: Integrate into autolabel (1-2 days)

#### 3.1 Modify autolabel.py

```python
# src/cli/autolabel.py
# (csv_files, load_documents_from_csv, save_documents_batch, process_text_pdf
#  and process_scanned_pdf are assumed to come from the existing autolabel module.)

def run_autolabel_dual_pool(args):
    """Run auto-labeling in dual-pool mode."""

    from src.processing.dual_pool_coordinator import DualPoolCoordinator

    # Database batching state
    db_batch = []
    db_batch_size = 100

    def on_result(result: TaskResult):
        """Handle a successful result."""
        nonlocal db_batch
        db_batch.append(result.data)

        if len(db_batch) >= db_batch_size:
            save_documents_batch(db_batch)
            db_batch.clear()

    def on_error(task_id: str, error: Exception):
        """Handle a failure."""
        logger.error(f"Task {task_id} failed: {error}")

    # Create the dual-pool coordinator
    with DualPoolCoordinator(
        cpu_workers=args.cpu_workers or 4,
        gpu_workers=args.gpu_workers or 1,
        gpu_id=0
    ) as coordinator:

        # Process every CSV
        for csv_file in csv_files:
            documents = load_documents_from_csv(csv_file)

            results = coordinator.process_batch(
                documents=documents,
                cpu_task_fn=process_text_pdf,
                gpu_task_fn=process_scanned_pdf,
                on_result=on_result,
                on_error=on_error
            )

            logger.info(f"CSV {csv_file}: {len(results)} processed")

    # Flush the remaining partial batch
    if db_batch:
        save_documents_batch(db_batch)
```
---

### Phase 4: Testing and Validation (1-2 days)

#### 4.1 Unit Tests

```python
# tests/unit/test_dual_pool.py
# (cpu_fn / gpu_fn are module-level test task functions; they must be
#  picklable for the spawn start method.)

import pytest
from src.processing.dual_pool_coordinator import DualPoolCoordinator, TaskResult

class TestDualPoolCoordinator:

    def test_cpu_only_batch(self):
        """Batch of CPU-only tasks."""
        with DualPoolCoordinator(cpu_workers=2, gpu_workers=1) as coord:
            docs = [{"id": f"doc_{i}", "type": "text"} for i in range(10)]
            results = coord.process_batch(docs, cpu_fn, gpu_fn)
            assert len(results) == 10
            assert all(r.success for r in results)

    def test_mixed_batch(self):
        """Mixed CPU/GPU batch."""
        with DualPoolCoordinator(cpu_workers=2, gpu_workers=1) as coord:
            docs = [
                {"id": "text_1", "type": "text"},
                {"id": "scan_1", "type": "scanned"},
                {"id": "text_2", "type": "text"},
            ]
            results = coord.process_batch(docs, cpu_fn, gpu_fn)
            assert len(results) == 3

    def test_timeout_handling(self):
        """Timeout handling."""
        pass

    def test_error_recovery(self):
        """Error recovery."""
        pass
```
#### 4.2 Integration Tests

```python
# tests/integration/test_autolabel_dual_pool.py

import subprocess

def test_autolabel_with_dual_pool():
    """End-to-end test of dual-pool mode."""
    # Use a small test dataset
    result = subprocess.run([
        "python", "-m", "src.cli.autolabel",
        "--cpu-workers", "2",
        "--gpu-workers", "1",
        "--limit", "50"
    ], capture_output=True)

    assert result.returncode == 0
    # Verify the database records
```
---

## 4. Key Technical Points

### 4.1 Strategies for Avoiding Deadlocks

```python
# 1. Use timeouts
try:
    result = future.result(timeout=300)
except TimeoutError:
    logger.warning("Task timed out")

# 2. Use sentinel values
SENTINEL = object()
queue.put(SENTINEL)  # signal end-of-stream

# 3. Check worker liveness
if not worker.is_alive():
    logger.error("Worker died unexpectedly")
    break

# 4. Drain the queue before joining
while not queue.empty():
    results.append(queue.get_nowait())
worker.join(timeout=5.0)
```
### 4.2 PaddleOCR Specifics

```python
# PaddleOCR must be initialized inside the worker process
def init_paddle_worker(gpu_id: int):
    global _ocr
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    # Import lazily so the CUDA environment variable takes effect first
    from paddleocr import PaddleOCR
    _ocr = PaddleOCR(
        use_angle_cls=True,
        lang='en',
        use_gpu=True,
        show_log=False,
        # Important: cap GPU memory usage
        gpu_mem=2000  # GPU memory limit (MB)
    )
```
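
Inside the worker, task functions then reuse that module-global instance. A sketch, assuming the PaddleOCR 2.x `ocr()` call signature:

```python
def run_ocr(image_path: str):
    # Runs inside the worker process; _ocr was set up by init_paddle_worker().
    global _ocr
    if _ocr is None:
        raise RuntimeError("worker not initialized")
    # PaddleOCR 2.x (assumption): returns, per page, a list of
    # (bbox, (text, confidence)) entries.
    return _ocr.ocr(image_path, cls=True)
```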
### 4.3 Resource Monitoring

```python
import psutil
import GPUtil

def get_resource_usage():
    """Collect system resource usage."""
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()

    gpu_info = []
    for gpu in GPUtil.getGPUs():
        gpu_info.append({
            "id": gpu.id,
            "memory_used": gpu.memoryUsed,
            "memory_total": gpu.memoryTotal,
            "utilization": gpu.load * 100
        })

    return {
        "cpu_percent": cpu_percent,
        "memory_percent": memory.percent,
        "gpu": gpu_info
    }
```
---

## 5. Risks and Mitigations

| Risk | Likelihood | Impact | Mitigation |
|------|--------|------|----------|
| GPU out of memory | Medium | High | Limit GPU workers to 1; set the gpu_mem parameter |
| Zombie processes | Low | High | Add heartbeat checks; restart automatically on timeout |
| Task misclassification | Medium | Medium | Add a fallback: retry on the GPU after a CPU failure |
| Database write bottleneck | Low | Medium | Larger batches; asynchronous writes |

---

## 6. Alternatives

If the approach above still has problems, consider:

### 6.1 Use Ray

```python
import ray

ray.init()

@ray.remote(num_cpus=1)
def cpu_task(data):
    return process_text_pdf(data)

@ray.remote(num_gpus=1)
def gpu_task(data):
    return process_scanned_pdf(data)

# Automatic resource-aware scheduling
futures = [cpu_task.remote(d) for d in cpu_docs]
futures += [gpu_task.remote(d) for d in gpu_docs]
results = ray.get(futures)
```

### 6.2 Single Pool + Dynamic GPU Scheduling

Keep the single-pool model but decide inside each task whether to use the GPU:

```python
def process_document(doc_data):
    if is_scanned_pdf(doc_data):
        # Use the GPU (needs a global lock or semaphore to bound
        # concurrency; see the sketch below)
        with gpu_semaphore:
            return process_with_ocr(doc_data)
    else:
        return process_text_only(doc_data)
```
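
The `gpu_semaphore` above has to be created in the parent and handed to each worker, for example via the pool initializer. A sketch; the three `process_*` / `is_scanned_pdf` helpers stand in for the ones in the snippet above:

```python
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor

_gpu_semaphore = None  # set in each worker by _init_worker

def _init_worker(sem):
    global _gpu_semaphore
    _gpu_semaphore = sem

def is_scanned_pdf(doc): return doc.get("type") == "scanned"  # placeholder
def process_with_ocr(doc): return ("ocr", doc["id"])          # placeholder
def process_text_only(doc): return ("text", doc["id"])        # placeholder

def process_document(doc):
    if is_scanned_pdf(doc):
        with _gpu_semaphore:  # at most one OCR task touches the GPU
            return process_with_ocr(doc)
    return process_text_only(doc)

def main():
    ctx = mp.get_context("spawn")
    sem = ctx.BoundedSemaphore(1)  # GPU concurrency = 1
    with ProcessPoolExecutor(max_workers=4, mp_context=ctx,
                             initializer=_init_worker,
                             initargs=(sem,)) as pool:
        docs = [{"id": i, "type": "scanned" if i % 2 else "text"} for i in range(4)]
        futures = [pool.submit(process_document, d) for d in docs]
        print([f.result() for f in futures])

if __name__ == "__main__":
    main()
```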
---

## 7. Timeline Summary

| Phase | Task | Estimated effort |
|------|------|------------|
| Phase 1 | Refactor the foundations | 2-3 days |
| Phase 2 | Dual-pool coordinator | 2-3 days |
| Phase 3 | autolabel integration | 1-2 days |
| Phase 4 | Testing and validation | 1-2 days |
| **Total** | | **6-10 days** |

---

## 8. References

1. [Python concurrent.futures official documentation](https://docs.python.org/3/library/concurrent.futures.html)
2. [PyTorch Multiprocessing Best Practices](https://docs.pytorch.org/docs/stable/notes/multiprocessing.html)
3. [Super Fast Python - complete ProcessPoolExecutor guide](https://superfastpython.com/processpoolexecutor-in-python/)
4. [PaddleOCR parallel inference documentation](http://www.paddleocr.ai/main/en/version3.x/pipeline_usage/instructions/parallel_inference.html)
5. [AWS - Parallelizing across multiple CPUs/GPUs to speed up deep learning inference at the edge](https://aws.amazon.com/blogs/machine-learning/parallelizing-across-multiple-cpu-gpus-to-speed-up-deep-learning-inference-at-the-edge/)
6. [Ray distributed multiprocessing](https://docs.ray.io/en/latest/ray-more-libs/multiprocessing.html)
docs/product-plan-v2.md (new file, 1223 lines; diff suppressed because it is too large)

docs/ux-design-prompt-v2.md (new file, 302 lines)
@@ -0,0 +1,302 @@
# Document Annotation Tool – UX Design Spec v2

## Theme: Warm Graphite (Modern Enterprise)

---

## 1. Design Principles (Updated)

1. **Clarity** – High contrast, but never pure black-on-white
2. **Warm Neutrality** – Slightly warm grays reduce visual fatigue
3. **Focus** – Content-first layouts with restrained accents
4. **Consistency** – Reusable patterns, predictable behavior
5. **Professional Trust** – Calm, serious, enterprise-ready
6. **Longevity** – No trendy colors that age quickly

---

## 2. Color Palette (Warm Graphite)

### Core Colors

| Usage | Color Name | Hex |
|------|-----------|-----|
| Primary Text | Soft Black | #121212 |
| Secondary Text | Charcoal Gray | #2A2A2A |
| Muted Text | Warm Gray | #6B6B6B |
| Disabled Text | Light Warm Gray | #9A9A9A |

### Backgrounds

| Usage | Color | Hex |
|-----|------|-----|
| App Background | Paper White | #FAFAF8 |
| Card / Panel | White | #FFFFFF |
| Hover Surface | Subtle Warm Gray | #F1F0ED |
| Selected Row | Very Light Warm Gray | #ECEAE6 |

### Borders & Dividers

| Usage | Color | Hex |
|------|------|-----|
| Default Border | Warm Light Gray | #E6E4E1 |
| Strong Divider | Neutral Gray | #D8D6D2 |

### Semantic States (Muted & Professional)

| State | Color | Hex |
|------|-------|-----|
| Success | Olive Gray | #3E4A3A |
| Error | Brick Gray | #4A3A3A |
| Warning | Sand Gray | #4A4A3A |
| Info | Graphite Gray | #3A3A3A |

> Accent colors are **never saturated** and are used only for status, progress, or selection.

---

## 3. Typography

- **Font Family**: Inter / SF Pro / system-ui
- **Headings**:
  - Weight: 600–700
  - Color: #121212
  - Letter spacing: -0.01em
- **Body Text**:
  - Weight: 400
  - Color: #2A2A2A
- **Captions / Meta**:
  - Weight: 400
  - Color: #6B6B6B
- **Monospace (IDs / Values)**:
  - JetBrains Mono / SF Mono
  - Color: #2A2A2A

---

## 4. Global Layout

### Top Navigation Bar

- Height: 56px
- Background: #FAFAF8
- Bottom Border: 1px solid #E6E4E1
- Logo: Text or icon in #121212

**Navigation Items**
- Default: #6B6B6B
- Hover: #2A2A2A
- Active:
  - Text: #121212
  - Bottom indicator: 2px solid #3A3A3A (rounded ends)

**Avatar**
- Circle background: #ECEAE6
- Text: #2A2A2A

---

## 5. Page: Documents (Dashboard)

### Page Header

- Title: "Documents" (#121212)
- Actions:
  - Primary button: Dark graphite outline
  - Secondary button: Subtle border only

### Filters Bar

- Background: #FFFFFF
- Border: 1px solid #E6E4E1
- Inputs:
  - Background: #FFFFFF
  - Hover: #F1F0ED
  - Focus ring: 1px #3A3A3A

### Document Table

- Table background: #FFFFFF
- Header text: #6B6B6B
- Row hover: #F1F0ED
- Row selected:
  - Background: #ECEAE6
  - Left indicator: 3px solid #3A3A3A

### Status Badges

- Pending:
  - BG: #FFFFFF
  - Border: #D8D6D2
  - Text: #2A2A2A

- Labeled:
  - BG: #2A2A2A
  - Text: #FFFFFF

- Exported:
  - BG: #ECEAE6
  - Text: #2A2A2A
  - Icon: ✓

### Auto-label States

- Running:
  - Progress bar: #3A3A3A on #ECEAE6
- Completed:
  - Text: #3E4A3A
- Failed:
  - BG: #F1EDED
  - Text: #4A3A3A

---
## 6. Upload Modals (Single & Batch)

### Modal Container

- Background: #FFFFFF
- Border radius: 8px
- Shadow: 0 1px 3px rgba(0,0,0,0.08)

### Drop Zone

- Background: #FAFAF8
- Border: 1px dashed #D8D6D2
- Hover: #F1F0ED
- Icon: Graphite gray

### Form Fields

- Input BG: #FFFFFF
- Border: #D8D6D2
- Focus: 1px solid #3A3A3A

Primary Action Button:
- Text: #FFFFFF
- BG: #2A2A2A
- Hover: #121212

---

## 7. Document Detail View

### Canvas Area

- Background: #FFFFFF
- Annotation styles:
  - Manual: Solid border #2A2A2A
  - Auto: Dashed border #6B6B6B
  - Selected: 2px border #3A3A3A + resize handles

### Right Info Panel

- Card background: #FFFFFF
- Section headers: #121212
- Meta text: #6B6B6B

### Annotation Table

- Same table styles as Documents
- Inline edit:
  - Input background: #FAFAF8
  - Save button: Graphite

### Locked State (Auto-label Running)

- Banner BG: #FAFAF8
- Border-left: 3px solid #4A4A3A
- Progress bar: Graphite

---

## 8. Training Page

### Document Selector

- Selected rows use same highlight rules
- Verified state:
  - Full: Olive gray check
  - Partial: Sand gray warning

### Configuration Panel

- Card layout
- Inputs aligned to grid
- Schedule option visually muted until enabled

Primary CTA:
- Start Training button in dark graphite

---

## 9. Models & Training History

### Training Job List

- Job cards use #FFFFFF background
- Running job:
  - Progress bar: #3A3A3A
- Completed job:
  - Metrics bars in graphite

### Model Detail Panel

- Sectioned cards
- Metric bars:
  - Track: #ECEAE6
  - Fill: #3A3A3A

Actions:
- Primary: Download Model
- Secondary: View Logs / Use as Base

---

## 10. Micro-interactions (Refined)

| Element | Interaction | Animation |
|------|------------|-----------|
| Button hover | BG lightens | 150ms ease-out |
| Button press | Scale 0.98 | 100ms |
| Row hover | BG fade | 120ms |
| Modal open | Fade + scale 0.96 → 1 | 200ms |
| Progress fill | Smooth | ease-out |
| Annotation select | Border + handles | 120ms |

---

## 11. Tailwind Theme (Updated)

```js
colors: {
  text: {
    primary: '#121212',
    secondary: '#2A2A2A',
    muted: '#6B6B6B',
    disabled: '#9A9A9A',
  },
  bg: {
    app: '#FAFAF8',
    card: '#FFFFFF',
    hover: '#F1F0ED',
    selected: '#ECEAE6',
  },
  border: '#E6E4E1',
  accent: '#3A3A3A',
  success: '#3E4A3A',
  error: '#4A3A3A',
  warning: '#4A4A3A',
}
```
---

## 12. Final Notes

- Pure black (#000000) should **never** be used for large surfaces
- Accent color usage should stay under **10% of UI area**
- Warm grays are intentional and must not be "corrected" to blue-grays

This theme is designed to scale from internal tool → polished SaaS without redesign.
docs/web-refactoring-complete.md (new file, 273 lines)
@@ -0,0 +1,273 @@
# Web Directory Refactoring - Complete ✅

**Date**: 2026-01-25
**Status**: ✅ Completed
**Tests**: 188 passing (0 failures)
**Coverage**: 23% (maintained)

---

## Final Directory Structure

```
src/web/
├── api/
│   ├── __init__.py
│   └── v1/
│       ├── __init__.py
│       ├── routes.py              # Public inference API
│       ├── admin/
│       │   ├── __init__.py
│       │   ├── documents.py       # Document management (was admin_routes.py)
│       │   ├── annotations.py     # Annotation routes (was admin_annotation_routes.py)
│       │   └── training.py        # Training routes (was admin_training_routes.py)
│       ├── async_api/
│       │   ├── __init__.py
│       │   └── routes.py          # Async processing API (was async_routes.py)
│       └── batch/
│           ├── __init__.py
│           └── routes.py          # Batch upload API (was batch_upload_routes.py)
│
├── schemas/
│   ├── __init__.py
│   ├── common.py                  # Shared models (ErrorResponse)
│   ├── admin.py                   # Admin schemas (was admin_schemas.py)
│   └── inference.py               # Inference + async schemas (was schemas.py)
│
├── services/
│   ├── __init__.py
│   ├── inference.py               # Inference service (was services.py)
│   ├── autolabel.py               # Auto-label service (was admin_autolabel.py)
│   ├── async_processing.py        # Async processing (was async_service.py)
│   └── batch_upload.py            # Batch upload service (was batch_upload_service.py)
│
├── core/
│   ├── __init__.py
│   ├── auth.py                    # Authentication (was admin_auth.py)
│   ├── rate_limiter.py            # Rate limiting (unchanged)
│   └── scheduler.py               # Task scheduler (was admin_scheduler.py)
│
├── workers/
│   ├── __init__.py
│   ├── async_queue.py             # Async task queue (moved as-is)
│   └── batch_queue.py             # Batch task queue (moved as-is)
│
├── __init__.py                    # Main exports
├── app.py                         # FastAPI app (imports updated)
├── config.py                      # Configuration (unchanged)
└── dependencies.py                # Global dependencies (unchanged)
```

---

## Changes Summary

### Files Moved and Renamed

| Old Location | New Location | Change Type |
|-------------|--------------|-------------|
| `admin_routes.py` | `api/v1/admin/documents.py` | Moved + Renamed |
| `admin_annotation_routes.py` | `api/v1/admin/annotations.py` | Moved + Renamed |
| `admin_training_routes.py` | `api/v1/admin/training.py` | Moved + Renamed |
| `admin_auth.py` | `core/auth.py` | Moved |
| `admin_autolabel.py` | `services/autolabel.py` | Moved |
| `admin_scheduler.py` | `core/scheduler.py` | Moved |
| `admin_schemas.py` | `schemas/admin.py` | Moved |
| `routes.py` | `api/v1/routes.py` | Moved |
| `schemas.py` | `schemas/inference.py` | Moved |
| `services.py` | `services/inference.py` | Moved |
| `async_routes.py` | `api/v1/async_api/routes.py` | Moved |
| `async_queue.py` | `workers/async_queue.py` | Moved |
| `async_service.py` | `services/async_processing.py` | Moved + Renamed |
| `batch_queue.py` | `workers/batch_queue.py` | Moved |
| `batch_upload_routes.py` | `api/v1/batch/routes.py` | Moved |
| `batch_upload_service.py` | `services/batch_upload.py` | Moved |

**Total**: 16 files reorganized

### Files Updated

**Source Files** (imports updated):
- `app.py` - Updated all imports to new structure
- `api/v1/admin/documents.py` - Updated schema/auth imports
- `api/v1/admin/annotations.py` - Updated schema/service imports
- `api/v1/admin/training.py` - Updated schema/auth imports
- `api/v1/routes.py` - Updated schema imports
- `api/v1/async_api/routes.py` - Updated schema imports
- `api/v1/batch/routes.py` - Updated service/worker imports
- `services/async_processing.py` - Updated worker/core imports

**Test Files** (all 16 updated, including `conftest.py`):
- `test_admin_annotations.py`
- `test_admin_auth.py`
- `test_admin_routes.py`
- `test_admin_routes_enhanced.py`
- `test_admin_training.py`
- `test_annotation_locks.py`
- `test_annotation_phase5.py`
- `test_async_queue.py`
- `test_async_routes.py`
- `test_async_service.py`
- `test_autolabel_with_locks.py`
- `test_batch_queue.py`
- `test_batch_upload_routes.py`
- `test_batch_upload_service.py`
- `test_training_phase4.py`
- `conftest.py`

---

## Import Examples

### Old Import Style (Before Refactoring)
```python
from src.web.admin_routes import create_admin_router
from src.web.admin_schemas import DocumentItem
from src.web.admin_auth import validate_admin_token
from src.web.async_routes import create_async_router
from src.web.schemas import ErrorResponse
```

### New Import Style (After Refactoring)
```python
# Admin API
from src.web.api.v1.admin.documents import create_admin_router
from src.web.api.v1.admin import create_admin_router  # Shorter alternative

# Schemas
from src.web.schemas.admin import DocumentItem
from src.web.schemas.common import ErrorResponse

# Core components
from src.web.core.auth import validate_admin_token

# Async API
from src.web.api.v1.async_api.routes import create_async_router
```
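
The shorter alternative presumably works because the package `__init__.py` re-exports the router factories. A sketch of what `api/v1/admin/__init__.py` might contain; only `create_admin_router` is confirmed by the examples above, the other two names are assumptions:

```python
# src/web/api/v1/admin/__init__.py -- sketch; export list is an assumption.
from src.web.api.v1.admin.documents import create_admin_router
from src.web.api.v1.admin.annotations import create_annotation_router  # assumed name
from src.web.api.v1.admin.training import create_training_router       # assumed name

__all__ = [
    "create_admin_router",
    "create_annotation_router",
    "create_training_router",
]
```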
---

## Benefits Achieved

### 1. **Clear Separation of Concerns**
- **API Routes**: All in `api/v1/` by version and feature
- **Data Models**: All in `schemas/` by domain
- **Business Logic**: All in `services/`
- **Core Components**: Reusable utilities in `core/`
- **Background Jobs**: Task queues in `workers/`

### 2. **Better Scalability**
- Easy to add API v2 without touching v1
- Clear namespace for each module
- Reduced file sizes (no 800+ line files)
- Follows single responsibility principle

### 3. **Improved Maintainability**
- Find files by function, not by prefix
- Each module has one clear purpose
- Easier to onboard new developers
- Better IDE navigation

### 4. **Standards Compliance**
- Follows FastAPI best practices
- Matches Django/Flask project structures
- Standard Python package organization
- Industry-standard naming conventions

---

## Testing Results

**Before Refactoring**:
- 188 tests passing
- 23% code coverage
- Flat directory structure

**After Refactoring**:
- ✅ 188 tests passing (0 failures)
- ✅ 23% code coverage (maintained)
- ✅ Clean hierarchical structure
- ✅ All imports updated
- ✅ No backward compatibility shims needed

---

## Migration Statistics

| Metric | Count |
|--------|-------|
| Files moved | 16 |
| Directories created | 9 |
| Files updated (source) | 8 |
| Files updated (tests) | 16 |
| Import statements updated | ~150 |
| Lines of code changed | ~200 |
| Tests broken | 0 |
| Coverage lost | 0% |

---

## Code Diff Summary

```diff
Before:
src/web/
├── admin_routes.py (645 lines)
├── admin_annotation_routes.py (504 lines)
├── admin_training_routes.py (565 lines)
├── admin_auth.py (22 lines)
├── admin_schemas.py (262 lines)
... (15 more files at root level)

After:
src/web/
├── api/v1/
│   ├── admin/ (3 route files)
│   ├── async_api/ (1 route file)
│   └── batch/ (1 route file)
├── schemas/ (3 schema files)
├── services/ (4 service files)
├── core/ (3 core files)
└── workers/ (2 worker files)
```

---

## Next Steps (Optional)

### Phase 2: Documentation
- [ ] Update API documentation with new import paths
- [ ] Create migration guide for external developers
- [ ] Update CLAUDE.md with new structure

### Phase 3: Further Optimization
- [ ] Split large files (>400 lines) if needed
- [ ] Extract common utilities
- [ ] Add typing stubs

### Phase 4: Deprecation (Future)
- [ ] Add deprecation warnings if creating compatibility layer
- [ ] Remove old imports after grace period
- [ ] Update all documentation

---

## Rollback Instructions

If needed, rollback is simple:
```bash
git revert <commit-hash>
```

All changes are in version control, making rollback safe and easy.

---

## Conclusion

✅ **Refactoring completed successfully**
✅ **Zero breaking changes**
✅ **All tests passing**
✅ **Industry-standard structure achieved**

The web directory is now organized following Python and FastAPI best practices, making it easier to scale, maintain, and extend.
docs/web-refactoring-plan.md (new file, 186 lines)
@@ -0,0 +1,186 @@
# Web Directory Refactoring Plan

## Current Structure Issues

1. **Flat structure**: All files in one directory (20 Python files)
2. **Naming inconsistency**: Mix of `admin_*`, `async_*`, `batch_*` prefixes
3. **Mixed concerns**: Routes, schemas, services, and workers in the same directory
4. **Poor scalability**: Hard to navigate and maintain as the project grows

## Proposed Structure (Best Practices)

```
src/web/
├── __init__.py                    # Main exports
├── app.py                         # FastAPI app factory
├── config.py                      # App configuration
├── dependencies.py                # Global dependencies
│
├── api/                           # API Routes Layer
│   ├── __init__.py
│   └── v1/                        # API version 1
│       ├── __init__.py
│       ├── routes.py              # Public API routes (inference)
│       ├── admin/                 # Admin API routes
│       │   ├── __init__.py
│       │   ├── documents.py       # admin_routes.py → documents.py
│       │   ├── annotations.py     # admin_annotation_routes.py → annotations.py
│       │   ├── training.py        # admin_training_routes.py → training.py
│       │   └── auth.py            # admin_auth.py → auth.py (routes only)
│       ├── async_api/             # Async processing API
│       │   ├── __init__.py
│       │   └── routes.py          # async_routes.py → routes.py
│       └── batch/                 # Batch upload API
│           ├── __init__.py
│           └── routes.py          # batch_upload_routes.py → routes.py
│
├── schemas/                       # Pydantic Models
│   ├── __init__.py
│   ├── common.py                  # Shared schemas (ErrorResponse, etc.)
│   ├── inference.py               # schemas.py → inference.py
│   ├── admin.py                   # admin_schemas.py → admin.py
│   ├── async_api.py               # New: async API schemas
│   └── batch.py                   # New: batch upload schemas
│
├── services/                      # Business Logic Layer
│   ├── __init__.py
│   ├── inference.py               # services.py → inference.py
│   ├── autolabel.py               # admin_autolabel.py → autolabel.py
│   ├── async_processing.py        # async_service.py → async_processing.py
│   └── batch_upload.py            # batch_upload_service.py → batch_upload.py
│
├── core/                          # Core Components
│   ├── __init__.py
│   ├── auth.py                    # admin_auth.py → auth.py (logic only)
│   ├── rate_limiter.py            # rate_limiter.py (moved as-is)
│   └── scheduler.py               # admin_scheduler.py → scheduler.py
│
└── workers/                       # Background Task Queues
    ├── __init__.py
    ├── async_queue.py             # async_queue.py (moved as-is)
    └── batch_queue.py             # batch_queue.py (moved as-is)
```

## File Mapping

### Current → New Location

| Current File | New Location | Purpose |
|--------------|--------------|---------|
| `admin_routes.py` | `api/v1/admin/documents.py` | Document management routes |
| `admin_annotation_routes.py` | `api/v1/admin/annotations.py` | Annotation routes |
| `admin_training_routes.py` | `api/v1/admin/training.py` | Training routes |
| `admin_auth.py` | Split: `api/v1/admin/auth.py` + `core/auth.py` | Auth routes + logic |
| `admin_schemas.py` | `schemas/admin.py` | Admin Pydantic models |
| `admin_autolabel.py` | `services/autolabel.py` | Auto-label service |
| `admin_scheduler.py` | `core/scheduler.py` | Training scheduler |
| `routes.py` | `api/v1/routes.py` | Public inference API |
| `schemas.py` | `schemas/inference.py` | Inference models |
| `services.py` | `services/inference.py` | Inference service |
| `async_routes.py` | `api/v1/async_api/routes.py` | Async API routes |
| `async_service.py` | `services/async_processing.py` | Async processing service |
| `async_queue.py` | `workers/async_queue.py` | Async task queue |
| `batch_upload_routes.py` | `api/v1/batch/routes.py` | Batch upload routes |
| `batch_upload_service.py` | `services/batch_upload.py` | Batch upload service |
| `batch_queue.py` | `workers/batch_queue.py` | Batch task queue |
| `rate_limiter.py` | `core/rate_limiter.py` | Rate limiting logic |
| `config.py` | `config.py` | Keep as-is |
| `dependencies.py` | `dependencies.py` | Keep as-is |
| `app.py` | `app.py` | Keep as-is (update imports) |

## Benefits

### 1. Clear Separation of Concerns
- **Routes**: API endpoint definitions
- **Schemas**: Data validation models
- **Services**: Business logic
- **Core**: Reusable components
- **Workers**: Background processing

### 2. Better Scalability
- Easy to add new API versions (`v2/`)
- Clear namespace for each domain
- Reduced file size (no 800+ line files)

### 3. Improved Maintainability
- Find files by function, not by prefix
- Each module has a single responsibility
- Easier to write focused tests

### 4. Standard Python Patterns
- Package-based organization
- Follows FastAPI best practices
- Similar to Django/Flask structures

## Implementation Steps

### Phase 1: Create New Structure (No Breaking Changes)
1. Create new directories: `api/`, `schemas/`, `services/`, `core/`, `workers/`
2. Copy files to new locations (don't delete originals yet; see the script sketch after these steps)
3. Update imports in new files
4. Add `__init__.py` with proper exports

### Phase 2: Update Tests
5. Update test imports to use new structure
6. Run tests to verify nothing breaks
7. Fix any import issues

### Phase 3: Update Main App
8. Update `app.py` to import from new locations
9. Run full test suite
10. Verify all endpoints work

### Phase 4: Cleanup
11. Delete old files
12. Update documentation
13. Final test run
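
A minimal sketch of the migration script mentioned under Next Steps below, covering Phases 1-3. It uses `git mv` rather than Phase 1's copy-then-delete so that file history is preserved; the move and rewrite maps are abbreviated to two entries each and should be extended from the File Mapping table:

```python
# migrate_web.py -- sketch only; extend MOVES/REWRITES from the mapping table.
import subprocess
from pathlib import Path

MOVES = {
    "src/web/admin_routes.py": "src/web/api/v1/admin/documents.py",
    "src/web/admin_schemas.py": "src/web/schemas/admin.py",
    # ... remaining rows of the mapping table
}

REWRITES = {
    "src.web.admin_routes": "src.web.api.v1.admin.documents",
    "src.web.admin_schemas": "src.web.schemas.admin",
}

def migrate() -> None:
    for old, new in MOVES.items():
        Path(new).parent.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "mv", old, new], check=True)  # keeps history
    # Rewrite import paths across source and tests
    for path in [*Path("src").rglob("*.py"), *Path("tests").rglob("*.py")]:
        text = path.read_text()
        for old, new in REWRITES.items():
            text = text.replace(old, new)
        path.write_text(text)

if __name__ == "__main__":
    migrate()
```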
## Migration Priority

**High Priority** (Most used):
- Routes and schemas (user-facing APIs)
- Services (core business logic)

**Medium Priority**:
- Core components (auth, rate limiter)
- Workers (background tasks)

**Low Priority**:
- Config and dependencies (already well-located)

## Backwards Compatibility

During migration, maintain backwards compatibility:

```python
# src/web/__init__.py
# Old imports still work
from src.web.api.v1.admin.documents import router as admin_router
from src.web.schemas.admin import AdminDocument

# Keep old names for compatibility (temporary)
admin_routes = admin_router  # Deprecated alias
```

## Testing Strategy

1. **Unit Tests**: Test each module independently
2. **Integration Tests**: Test API endpoints still work
3. **Import Tests**: Verify all old imports still work
4. **Coverage**: Maintain current 23% coverage minimum

## Rollback Plan

If issues arise:
1. Keep old files until fully migrated
2. Git allows easy revert
3. Tests catch breaking changes early

---

## Next Steps

Would you like me to:
1. **Start Phase 1**: Create new directory structure and move files?
2. **Create migration script**: Automate the file moves and import updates?
3. **Focus on specific area**: Start with admin API or async API first?
docs/web-refactoring-status.md (new file, 218 lines)
@@ -0,0 +1,218 @@
# Web Directory Refactoring - Current Status

## ✅ Completed Steps

### 1. Directory Structure Created
```
src/web/
├── api/
│   ├── v1/
│   │   ├── admin/       (documents.py, annotations.py, training.py)
│   │   ├── async_api/   (routes.py)
│   │   ├── batch/       (routes.py)
│   │   └── routes.py    (public inference API)
├── schemas/
│   ├── admin.py         (admin schemas)
│   ├── inference.py     (inference + async schemas)
│   └── common.py        (ErrorResponse)
├── services/
│   ├── autolabel.py
│   ├── async_processing.py
│   ├── batch_upload.py
│   └── inference.py
├── core/
│   ├── auth.py
│   ├── rate_limiter.py
│   └── scheduler.py
└── workers/
    ├── async_queue.py
    └── batch_queue.py
```

### 2. Files Copied and Imports Updated

#### Admin API (✅ Complete)
- [x] `admin_routes.py` → `api/v1/admin/documents.py` (imports updated)
- [x] `admin_annotation_routes.py` → `api/v1/admin/annotations.py` (imports updated)
- [x] `admin_training_routes.py` → `api/v1/admin/training.py` (imports updated)
- [x] `api/v1/admin/__init__.py` created with exports

#### Public & Async API (✅ Complete)
- [x] `routes.py` → `api/v1/routes.py` (imports updated)
- [x] `async_routes.py` → `api/v1/async_api/routes.py` (imports updated)
- [x] `batch_upload_routes.py` → `api/v1/batch/routes.py` (copied, imports pending)

#### Schemas (✅ Complete)
- [x] `admin_schemas.py` → `schemas/admin.py`
- [x] `schemas.py` → `schemas/inference.py`
- [x] `schemas/common.py` created
- [x] `schemas/__init__.py` created with exports

#### Services (✅ Complete)
- [x] `admin_autolabel.py` → `services/autolabel.py`
- [x] `async_service.py` → `services/async_processing.py`
- [x] `batch_upload_service.py` → `services/batch_upload.py`
- [x] `services.py` → `services/inference.py`
- [x] `services/__init__.py` created

#### Core Components (✅ Complete)
- [x] `admin_auth.py` → `core/auth.py`
- [x] `rate_limiter.py` → `core/rate_limiter.py`
- [x] `admin_scheduler.py` → `core/scheduler.py`
- [x] `core/__init__.py` created

#### Workers (✅ Complete)
- [x] `async_queue.py` → `workers/async_queue.py`
- [x] `batch_queue.py` → `workers/batch_queue.py`
- [x] `workers/__init__.py` created

#### Main App (✅ Complete)
- [x] `app.py` imports updated to use new structure

---

## ⏳ Remaining Work

### 1. Update Remaining File Imports (HIGH PRIORITY)

Files that need import updates:
- [ ] `api/v1/batch/routes.py` - update to use new schema/service imports
- [ ] `services/autolabel.py` - may need import updates if it references old paths
- [ ] `services/async_processing.py` - check for old import references
- [ ] `services/batch_upload.py` - check for old import references
- [ ] `services/inference.py` - check for old import references

### 2. Update ALL Test Files (CRITICAL)

Test files need to import from new locations. Pattern:

**Old:**
```python
from src.web.admin_routes import create_admin_router
from src.web.admin_schemas import DocumentItem
from src.web.admin_auth import validate_admin_token
```

**New:**
```python
from src.web.api.v1.admin import create_admin_router
from src.web.schemas.admin import DocumentItem
from src.web.core.auth import validate_admin_token
```
|
||||||
|
|
||||||
|
Test files to update:
|
||||||
|
- [ ] `tests/web/test_admin_annotations.py`
|
||||||
|
- [ ] `tests/web/test_admin_auth.py`
|
||||||
|
- [ ] `tests/web/test_admin_routes.py`
|
||||||
|
- [ ] `tests/web/test_admin_routes_enhanced.py`
|
||||||
|
- [ ] `tests/web/test_admin_training.py`
|
||||||
|
- [ ] `tests/web/test_annotation_locks.py`
|
||||||
|
- [ ] `tests/web/test_annotation_phase5.py`
|
||||||
|
- [ ] `tests/web/test_async_queue.py`
|
||||||
|
- [ ] `tests/web/test_async_routes.py`
|
||||||
|
- [ ] `tests/web/test_async_service.py`
|
||||||
|
- [ ] `tests/web/test_autolabel_with_locks.py`
|
||||||
|
- [ ] `tests/web/test_batch_queue.py`
|
||||||
|
- [ ] `tests/web/test_batch_upload_routes.py`
|
||||||
|
- [ ] `tests/web/test_batch_upload_service.py`
|
||||||
|
- [ ] `tests/web/test_rate_limiter.py`
|
||||||
|
- [ ] `tests/web/test_training_phase4.py`
|
||||||
|
|
||||||
|
### 3. Create Backward Compatibility Layer (OPTIONAL)
|
||||||
|
|
||||||
|
Keep old imports working temporarily:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/web/admin_routes.py (temporary compatibility shim)
|
||||||
|
\"\"\"
|
||||||
|
DEPRECATED: Use src.web.api.v1.admin.documents instead.
|
||||||
|
This file will be removed in next version.
|
||||||
|
\"\"\"
|
||||||
|
import warnings
|
||||||
|
from src.web.api.v1.admin.documents import *
|
||||||
|
|
||||||
|
warnings.warn(
|
||||||
|
"Importing from src.web.admin_routes is deprecated. "
|
||||||
|
"Use src.web.api.v1.admin.documents instead.",
|
||||||
|
DeprecationWarning,
|
||||||
|
stacklevel=2
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Verify and Test
|
||||||
|
|
||||||
|
1. Run tests:
|
||||||
|
```bash
|
||||||
|
pytest tests/web/ -v
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check for any import errors:
|
||||||
|
```bash
|
||||||
|
python -c "from src.web.app import create_app; create_app()"
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Start server and test endpoints:
|
||||||
|
```bash
|
||||||
|
python run_server.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Clean Up Old Files (ONLY AFTER TESTS PASS)
|
||||||
|
|
||||||
|
Old files to remove:
|
||||||
|
- `src/web/admin_*.py` (7 files)
|
||||||
|
- `src/web/async_*.py` (3 files)
|
||||||
|
- `src/web/batch_*.py` (3 files)
|
||||||
|
- `src/web/routes.py`
|
||||||
|
- `src/web/services.py`
|
||||||
|
- `src/web/schemas.py`
|
||||||
|
- `src/web/rate_limiter.py`
|
||||||
|
|
||||||
|
Keep these files (don't remove):
|
||||||
|
- `src/web/__init__.py`
|
||||||
|
- `src/web/app.py`
|
||||||
|
- `src/web/config.py`
|
||||||
|
- `src/web/dependencies.py`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎯 Next Immediate Steps
|
||||||
|
|
||||||
|
1. **Update batch/routes.py imports** - Quick fix for remaining API route
|
||||||
|
2. **Update test file imports** - Critical for verification
|
||||||
|
3. **Run test suite** - Verify nothing broke
|
||||||
|
4. **Fix any import errors** - Address failures
|
||||||
|
5. **Remove old files** - Clean up after tests pass
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📊 Migration Impact Summary
|
||||||
|
|
||||||
|
| Category | Files Moved | Imports Updated | Status |
|
||||||
|
|----------|-------------|-----------------|--------|
|
||||||
|
| API Routes | 7 | 5/7 | 🟡 In Progress |
|
||||||
|
| Schemas | 3 | 3/3 | ✅ Complete |
|
||||||
|
| Services | 4 | 0/4 | ⚠️ Pending |
|
||||||
|
| Core | 3 | 3/3 | ✅ Complete |
|
||||||
|
| Workers | 2 | 2/2 | ✅ Complete |
|
||||||
|
| Tests | 0 | 0/16 | ❌ Not Started |
|
||||||
|
|
||||||
|
**Overall Progress: 65%**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚀 Benefits After Migration
|
||||||
|
|
||||||
|
1. **Better Organization**: Clear separation by function
|
||||||
|
2. **Easier Navigation**: Find files by purpose, not prefix
|
||||||
|
3. **Scalability**: Easy to add new API versions
|
||||||
|
4. **Standard Structure**: Follows FastAPI best practices
|
||||||
|
5. **Maintainability**: Each module has single responsibility
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📝 Notes
|
||||||
|
|
||||||
|
- All original files are still in place (no data loss risk)
|
||||||
|
- New structure is operational but needs import updates
|
||||||
|
- Backward compatibility can be added if needed
|
||||||
|
- Tests will validate the migration success
|
||||||
frontend/.env.example (new file, 5 lines)
@@ -0,0 +1,5 @@
# Backend API URL
VITE_API_URL=http://localhost:8000

# WebSocket URL (for future real-time updates)
VITE_WS_URL=ws://localhost:8000/ws

frontend/.gitignore (new file, vendored, 24 lines)
@@ -0,0 +1,24 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

node_modules
dist
dist-ssr
*.local

# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

frontend/README.md (new file, 20 lines)
@@ -0,0 +1,20 @@
<div align="center">
<img width="1200" height="475" alt="GHBanner" src="https://github.com/user-attachments/assets/0aa67016-6eaf-458a-adb2-6e31a0763ed6" />
</div>

# Run and deploy your AI Studio app

This contains everything you need to run your app locally.

View your app in AI Studio: https://ai.studio/apps/drive/13hqd80ft4g_LngMYB8LLJxx2XU8C_eI4

## Run Locally

**Prerequisites:** Node.js

1. Install dependencies:
   `npm install`
2. Set the `GEMINI_API_KEY` in [.env.local](.env.local) to your Gemini API key
3. Run the app:
   `npm run dev`

frontend/REFACTORING_PLAN.md (new file, 240 lines)
@@ -0,0 +1,240 @@
# Frontend Refactoring Plan

## Current Structure Issues

1. **Flat component organization** - All components in one directory
2. **Mock data only** - No real API integration
3. **No state management** - Props drilling everywhere
4. **CDN dependencies** - Should use npm packages
5. **Manual routing** - Using useState instead of react-router
6. **No TypeScript integration with backend** - Types don't match API schemas

## Recommended Structure

```
frontend/
├── public/
│   └── favicon.ico
│
├── src/
│   ├── api/                    # API Layer
│   │   ├── client.ts           # Axios instance + interceptors
│   │   ├── types.ts            # API request/response types
│   │   └── endpoints/
│   │       ├── documents.ts    # GET /api/v1/admin/documents
│   │       ├── annotations.ts  # GET/POST /api/v1/admin/documents/{id}/annotations
│   │       ├── training.ts     # GET/POST /api/v1/admin/training/*
│   │       ├── inference.ts    # POST /api/v1/infer
│   │       └── async.ts        # POST /api/v1/async/submit
│   │
│   ├── components/
│   │   ├── common/             # Reusable components
│   │   │   ├── Badge.tsx
│   │   │   ├── Button.tsx
│   │   │   ├── Input.tsx
│   │   │   ├── Modal.tsx
│   │   │   ├── Table.tsx
│   │   │   ├── ProgressBar.tsx
│   │   │   └── StatusBadge.tsx
│   │   │
│   │   ├── layout/             # Layout components
│   │   │   ├── TopNav.tsx
│   │   │   ├── Sidebar.tsx
│   │   │   └── PageHeader.tsx
│   │   │
│   │   ├── documents/          # Document-specific components
│   │   │   ├── DocumentTable.tsx
│   │   │   ├── DocumentFilters.tsx
│   │   │   ├── DocumentRow.tsx
│   │   │   ├── UploadModal.tsx
│   │   │   └── BatchUploadModal.tsx
│   │   │
│   │   ├── annotations/        # Annotation components
│   │   │   ├── AnnotationCanvas.tsx
│   │   │   ├── AnnotationBox.tsx
│   │   │   ├── AnnotationTable.tsx
│   │   │   ├── FieldEditor.tsx
│   │   │   └── VerificationPanel.tsx
│   │   │
│   │   └── training/           # Training components
│   │       ├── DocumentSelector.tsx
│   │       ├── TrainingConfig.tsx
│   │       ├── TrainingJobList.tsx
│   │       ├── ModelCard.tsx
│   │       └── MetricsChart.tsx
│   │
│   ├── pages/                  # Page-level components
│   │   ├── DocumentsPage.tsx         # Was Dashboard.tsx
│   │   ├── DocumentDetailPage.tsx    # Was DocumentDetail.tsx
│   │   ├── TrainingPage.tsx          # Was Training.tsx
│   │   ├── ModelsPage.tsx            # Was Models.tsx
│   │   └── InferencePage.tsx         # New: Test inference
│   │
│   ├── hooks/                  # Custom React Hooks
│   │   ├── useDocuments.ts     # Document CRUD + listing
│   │   ├── useAnnotations.ts   # Annotation management
│   │   ├── useTraining.ts      # Training jobs
│   │   ├── usePolling.ts       # Auto-refresh for async jobs
│   │   └── useDebounce.ts      # Debounce search inputs
│   │
│   ├── store/                  # State Management (Zustand)
│   │   ├── documentsStore.ts
│   │   ├── annotationsStore.ts
│   │   ├── trainingStore.ts
│   │   └── uiStore.ts
│   │
│   ├── types/                  # TypeScript Types
│   │   ├── index.ts
│   │   ├── document.ts
│   │   ├── annotation.ts
│   │   ├── training.ts
│   │   └── api.ts
│   │
│   ├── utils/                  # Utility Functions
│   │   ├── formatters.ts       # Date, currency, etc.
│   │   ├── validators.ts       # Form validation
│   │   └── constants.ts        # Field definitions, statuses
│   │
│   ├── styles/
│   │   └── index.css           # Tailwind entry
│   │
│   ├── App.tsx
│   ├── main.tsx
│   └── router.tsx              # React Router config
│
├── .env.example
├── package.json
├── tsconfig.json
├── vite.config.ts
├── tailwind.config.js
├── postcss.config.js
└── index.html
```

## Migration Steps

### Phase 1: Setup Infrastructure
- [ ] Install dependencies (axios, react-router, zustand, @tanstack/react-query)
- [ ] Setup local Tailwind (remove CDN)
- [ ] Create API client with interceptors
- [ ] Add environment variables (.env.local with VITE_API_URL)

### Phase 2: Create API Layer
- [ ] Create `src/api/client.ts` with axios instance
- [ ] Create `src/api/endpoints/documents.ts` matching backend API
- [ ] Create `src/api/endpoints/annotations.ts`
- [ ] Create `src/api/endpoints/training.ts`
- [ ] Add types matching backend schemas

### Phase 3: Reorganize Components
- [ ] Move existing components to new structure
- [ ] Split large components (Dashboard > DocumentTable + DocumentFilters + DocumentRow)
- [ ] Extract reusable components (Badge, Button already done)
- [ ] Create layout components (TopNav, Sidebar)

### Phase 4: Add Routing
- [ ] Install react-router-dom
- [ ] Create router.tsx with routes (see the sketch after this list)
- [ ] Update App.tsx to use RouterProvider
- [ ] Add navigation links
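
A minimal sketch of what the router could look like, assuming `react-router-dom` v6 and the page components named above (paths, filenames, and default exports are illustrative, not final):

```tsx
// src/router.tsx, sketch only; assumes the pages above use default exports.
import { createBrowserRouter } from 'react-router-dom'
import DocumentsPage from './pages/DocumentsPage'
import DocumentDetailPage from './pages/DocumentDetailPage'
import TrainingPage from './pages/TrainingPage'
import ModelsPage from './pages/ModelsPage'

// One route per page; the document id lands in useParams() inside DocumentDetailPage.
export const router = createBrowserRouter([
  { path: '/', element: <DocumentsPage /> },
  { path: '/documents/:docId', element: <DocumentDetailPage /> },
  { path: '/training', element: <TrainingPage /> },
  { path: '/models', element: <ModelsPage /> },
])
```

App.tsx then shrinks to a thin `<RouterProvider router={router} />` wrapper.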

### Phase 5: State Management
- [ ] Create custom hooks (useDocuments, useAnnotations); see the sketch after this list
- [ ] Use @tanstack/react-query for server state
- [ ] Add Zustand stores for UI state
- [ ] Replace mock data with API calls
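
Once the API layer exists, the hook layer stays thin. A minimal sketch of `useDocuments` under these assumptions (the query-key shape and the returned fields are suggestions, not a final API):

```typescript
// src/hooks/useDocuments.ts, sketch only; assumes the documentsApi module from Phase 2.
import { useQuery, useQueryClient, useMutation } from '@tanstack/react-query'
import { documentsApi } from '../api/endpoints/documents'

export function useDocuments(params?: { status?: string; limit?: number; offset?: number }) {
  const queryClient = useQueryClient()

  // Server state lives in react-query; components never fetch directly.
  const query = useQuery({
    queryKey: ['documents', params],
    queryFn: () => documentsApi.list(params),
  })

  // Mutations invalidate the list so the table refreshes automatically.
  const deleteMutation = useMutation({
    mutationFn: (id: string) => documentsApi.delete(id),
    onSuccess: () => queryClient.invalidateQueries({ queryKey: ['documents'] }),
  })

  return {
    documents: query.data?.documents ?? [],
    total: query.data?.total ?? 0,
    isLoading: query.isLoading,
    deleteDocument: deleteMutation.mutate,
  }
}
```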

### Phase 6: Backend Integration
- [ ] Update CORS settings in backend
- [ ] Test all API endpoints
- [ ] Add error handling
- [ ] Add loading states

## Dependencies to Add

```json
{
  "dependencies": {
    "react-router-dom": "^6.22.0",
    "axios": "^1.6.7",
    "zustand": "^4.5.0",
    "@tanstack/react-query": "^5.20.0",
    "date-fns": "^3.3.0",
    "clsx": "^2.1.0"
  },
  "devDependencies": {
    "tailwindcss": "^3.4.1",
    "autoprefixer": "^10.4.17",
    "postcss": "^8.4.35"
  }
}
```

## Configuration Files to Create

### tailwind.config.js
```javascript
export default {
  content: ['./index.html', './src/**/*.{js,ts,jsx,tsx}'],
  theme: {
    extend: {
      colors: {
        warm: {
          bg: '#FAFAF8',
          card: '#FFFFFF',
          hover: '#F1F0ED',
          selected: '#ECEAE6',
          border: '#E6E4E1',
          divider: '#D8D6D2',
          text: {
            primary: '#121212',
            secondary: '#2A2A2A',
            muted: '#6B6B6B',
            disabled: '#9A9A9A',
          },
          state: {
            success: '#3E4A3A',
            error: '#4A3A3A',
            warning: '#4A4A3A',
            info: '#3A3A3A',
          }
        }
      }
    }
  }
}
```

### .env.example
```bash
VITE_API_URL=http://localhost:8000
VITE_WS_URL=ws://localhost:8000/ws
```

## Type Generation from Backend

Consider generating TypeScript types from Python Pydantic schemas:
- Option 1: Use `datamodel-code-generator` to convert schemas
- Option 2: Manually maintain types in `src/types/api.ts` (example below)
- Option 3: Use OpenAPI spec + openapi-typescript-codegen
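
Whichever option wins, the mapping itself is mechanical. A hedged example of Option 2, with a simplified, assumed Pydantic schema on the Python side (not the actual backend model):

```typescript
// Hand-maintained mirror of a backend Pydantic schema (Option 2).
// Assumed Python side (simplified, illustrative):
//   class DocumentItem(BaseModel):
//       document_id: str
//       status: Literal["pending", "labeled", "verified", "exported"]
//       annotation_count: int | None = None
export interface DocumentItem {
  document_id: string
  status: 'pending' | 'labeled' | 'verified' | 'exported'
  annotation_count?: number | null
}
```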

## Testing Strategy

- Unit tests: Vitest for components (see the sketch after this list)
- Integration tests: React Testing Library
- E2E tests: Playwright (matching backend)
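
For the unit level, a minimal sketch, assuming Vitest and @testing-library/react are added as dev dependencies (neither is in the dependency list above yet):

```tsx
// Button.test.tsx, sketch only; assumes vitest + @testing-library/react are installed.
import { render, screen, fireEvent } from '@testing-library/react'
import { describe, it, expect, vi } from 'vitest'
import { Button } from '../components/Button'

describe('Button', () => {
  it('renders its label and fires onClick', () => {
    const onClick = vi.fn()
    render(<Button onClick={onClick}>Save</Button>)
    fireEvent.click(screen.getByText('Save'))
    expect(onClick).toHaveBeenCalledTimes(1)
  })
})
```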

## Performance Considerations

- Code splitting by route
- Lazy load heavy components (AnnotationCanvas); see the sketch after this list
- Optimize re-renders with React.memo
- Use virtual scrolling for large tables
- Image lazy loading for document previews
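
For the lazy-loading point, a minimal sketch (the import path is illustrative and assumes `AnnotationCanvas` has a default export):

```tsx
// Lazy-loading a heavy component: the canvas bundle is only fetched
// when the detail page actually renders it.
import React, { Suspense, lazy } from 'react'

const AnnotationCanvas = lazy(() => import('./components/annotations/AnnotationCanvas'))

export const DocumentDetailPage: React.FC = () => (
  <Suspense fallback={<div>Loading canvas...</div>}>
    <AnnotationCanvas />
  </Suspense>
)
```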

## Accessibility

- Proper ARIA labels
- Keyboard navigation
- Focus management
- Color contrast compliance (already done with Warm Graphite theme)

frontend/SETUP.md (new file, 256 lines)
@@ -0,0 +1,256 @@
# Frontend Setup Guide

## Quick Start

### 1. Install Dependencies

```bash
cd frontend
npm install
```

### 2. Configure Environment

Copy `.env.example` to `.env.local` and update if needed:

```bash
cp .env.example .env.local
```

Default configuration:
```
VITE_API_URL=http://localhost:8000
VITE_WS_URL=ws://localhost:8000/ws
```

### 3. Start Backend API

Make sure the backend is running first:

```bash
# From project root
wsl bash -c "source ~/miniconda3/etc/profile.d/conda.sh && conda activate invoice-py311 && python run_server.py"
```

Backend will be available at: http://localhost:8000

### 4. Start Frontend Dev Server

```bash
cd frontend
npm run dev
```

Frontend will be available at: http://localhost:3000

## Project Structure

```
frontend/
├── src/
│   ├── api/                    # API client layer
│   │   ├── client.ts           # Axios instance with interceptors
│   │   ├── types.ts            # API type definitions
│   │   └── endpoints/
│   │       ├── documents.ts    # Document API calls
│   │       ├── annotations.ts  # Annotation API calls
│   │       └── training.ts     # Training API calls
│   │
│   ├── components/             # React components
│   │   └── Dashboard.tsx       # Updated with real API integration
│   │
│   ├── hooks/                  # Custom React Hooks
│   │   ├── useDocuments.ts
│   │   ├── useDocumentDetail.ts
│   │   ├── useAnnotations.ts
│   │   └── useTraining.ts
│   │
│   ├── styles/
│   │   └── index.css           # Tailwind CSS entry
│   │
│   ├── App.tsx
│   └── main.tsx                # App entry point with QueryClient
│
├── components/                 # Legacy components (to be migrated)
│   ├── Badge.tsx
│   ├── Button.tsx
│   ├── Layout.tsx
│   ├── DocumentDetail.tsx
│   ├── Training.tsx
│   ├── Models.tsx
│   └── UploadModal.tsx
│
├── tailwind.config.js          # Tailwind configuration
├── postcss.config.js
├── vite.config.ts
├── package.json
└── index.html
```

## Key Technologies

- **React 19** - UI framework
- **TypeScript** - Type safety
- **Vite** - Build tool
- **Tailwind CSS** - Styling (Warm Graphite theme)
- **Axios** - HTTP client
- **@tanstack/react-query** - Server state management
- **lucide-react** - Icon library

## API Integration

### Authentication

The app stores the admin token in localStorage:

```typescript
localStorage.setItem('admin_token', 'your-token')
```

All API requests automatically include the `X-Admin-Token` header.
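
For example, once the token is set, any call that goes through the shared client picks up the header without extra code (a minimal sketch; `documentsApi` is the module defined in `src/api/endpoints/documents.ts`):

```typescript
// The request interceptor in src/api/client.ts reads admin_token from
// localStorage and attaches it as X-Admin-Token before the request is sent.
import { documentsApi } from './api/endpoints'

localStorage.setItem('admin_token', 'your-token')

const { documents, total } = await documentsApi.list({ limit: 5 })
console.log(`Fetched ${documents.length} of ${total} documents`)
```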

### Available Hooks

#### useDocuments

```typescript
const {
  documents,
  total,
  isLoading,
  uploadDocument,
  deleteDocument,
  triggerAutoLabel,
} = useDocuments({ status: 'labeled', limit: 20 })
```

#### useDocumentDetail

```typescript
const { document, annotations, isLoading } = useDocumentDetail(documentId)
```

#### useAnnotations

```typescript
const {
  createAnnotation,
  updateAnnotation,
  deleteAnnotation,
  verifyAnnotation,
  overrideAnnotation,
} = useAnnotations(documentId)
```

#### useTraining

```typescript
const {
  models,
  isLoadingModels,
  startTraining,
  downloadModel,
} = useTraining()
```

## Features Implemented

### Phase 1 (Completed)
- ✅ API client with axios interceptors
- ✅ Type-safe API endpoints
- ✅ React Query for server state
- ✅ Custom hooks for all APIs
- ✅ Dashboard with real data
- ✅ Local Tailwind CSS
- ✅ Environment configuration
- ✅ CORS configured in backend

### Phase 2 (TODO)
- [ ] Update DocumentDetail to use useDocumentDetail
- [ ] Update Training page to use useTraining hooks
- [ ] Update Models page with real data
- [ ] Add UploadModal integration with API
- [ ] Add react-router for proper routing
- [ ] Add error boundary
- [ ] Add loading states
- [ ] Add toast notifications

### Phase 3 (TODO)
- [ ] Annotation canvas with real data
- [ ] Batch upload functionality
- [ ] Auto-label progress polling
- [ ] Training job monitoring
- [ ] Model download functionality
- [ ] Search and filtering
- [ ] Pagination

## Development Tips

### Hot Module Replacement

Vite supports HMR. Changes are reflected immediately without a page reload.

### API Debugging

Check the browser console for API requests:
- Network tab shows all requests/responses
- Axios interceptors log errors automatically

### Type Safety

TypeScript types in `src/api/types.ts` match backend Pydantic schemas.

To regenerate types from backend:
```bash
# TODO: Add type generation script
```

### Backend API Documentation

Visit http://localhost:8000/docs for interactive API documentation (Swagger UI).

## Troubleshooting

### CORS Errors

If you see CORS errors:
1. Check that the backend is running at http://localhost:8000
2. Verify CORS settings in `src/web/app.py`
3. Check that `.env.local` has the correct `VITE_API_URL`

### Module Not Found

If imports fail:
```bash
rm -rf node_modules package-lock.json
npm install
```

### Types Not Matching

If API responses don't match types:
1. Check that the backend version is up-to-date
2. Verify types in `src/api/types.ts`
3. Check the API response in the Network tab

## Next Steps

1. Run `npm install` to install dependencies
2. Start the backend server
3. Run `npm run dev` to start the frontend
4. Open http://localhost:3000
5. Create an admin token via the backend API
6. Store the token in localStorage via the browser console:
```javascript
localStorage.setItem('admin_token', 'your-token-here')
```
7. Refresh the page to see authenticated API calls

## Production Build

```bash
npm run build
npm run preview  # Preview production build
```

Build output will be in the `dist/` directory.

frontend/index.html (new file, 15 lines)
@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Graphite Annotator - Invoice Field Extraction</title>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
</head>
<body>
  <div id="root"></div>
  <script type="module" src="/src/main.tsx"></script>
</body>
</html>

frontend/metadata.json (new file, 5 lines)
@@ -0,0 +1,5 @@
{
  "name": "Graphite Annotator",
  "description": "A professional, warm graphite themed document annotation and training tool for enterprise use cases.",
  "requestFramePermissions": []
}

frontend/package-lock.json (generated, new file, 3510 lines)
Diff suppressed because it is too large.
frontend/package.json (new file, 32 lines)
@@ -0,0 +1,32 @@
{
  "name": "graphite-annotator",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "vite build",
    "preview": "vite preview"
  },
  "dependencies": {
    "react": "^19.2.3",
    "react-dom": "^19.2.3",
    "lucide-react": "^0.563.0",
    "recharts": "^3.7.0",
    "axios": "^1.6.7",
    "react-router-dom": "^6.22.0",
    "zustand": "^4.5.0",
    "@tanstack/react-query": "^5.20.0",
    "date-fns": "^3.3.0",
    "clsx": "^2.1.0"
  },
  "devDependencies": {
    "@types/node": "^22.14.0",
    "@vitejs/plugin-react": "^5.0.0",
    "typescript": "~5.8.2",
    "vite": "^6.2.0",
    "tailwindcss": "^3.4.1",
    "autoprefixer": "^10.4.17",
    "postcss": "^8.4.35"
  }
}

frontend/postcss.config.js (new file, 6 lines)
@@ -0,0 +1,6 @@
export default {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
}

frontend/src/App.tsx (new file, 73 lines)
@@ -0,0 +1,73 @@
import React, { useState, useEffect } from 'react'
import { Layout } from './components/Layout'
import { DashboardOverview } from './components/DashboardOverview'
import { Dashboard } from './components/Dashboard'
import { DocumentDetail } from './components/DocumentDetail'
import { Training } from './components/Training'
import { Models } from './components/Models'
import { Login } from './components/Login'
import { InferenceDemo } from './components/InferenceDemo'

const App: React.FC = () => {
  const [currentView, setCurrentView] = useState('dashboard')
  const [selectedDocId, setSelectedDocId] = useState<string | null>(null)
  const [isAuthenticated, setIsAuthenticated] = useState(false)

  useEffect(() => {
    const token = localStorage.getItem('admin_token')
    setIsAuthenticated(!!token)
  }, [])

  const handleNavigate = (view: string, docId?: string) => {
    setCurrentView(view)
    if (docId) {
      setSelectedDocId(docId)
    }
  }

  const handleLogin = (token: string) => {
    setIsAuthenticated(true)
  }

  const handleLogout = () => {
    localStorage.removeItem('admin_token')
    setIsAuthenticated(false)
    setCurrentView('documents')
  }

  if (!isAuthenticated) {
    return <Login onLogin={handleLogin} />
  }

  const renderContent = () => {
    switch (currentView) {
      case 'dashboard':
        return <DashboardOverview onNavigate={handleNavigate} />
      case 'documents':
        return <Dashboard onNavigate={handleNavigate} />
      case 'detail':
        return (
          <DocumentDetail
            docId={selectedDocId || '1'}
            onBack={() => setCurrentView('documents')}
          />
        )
      case 'demo':
        return <InferenceDemo />
      case 'training':
        return <Training />
      case 'models':
        return <Models />
      default:
        return <DashboardOverview onNavigate={handleNavigate} />
    }
  }

  return (
    <Layout activeView={currentView} onNavigate={handleNavigate} onLogout={handleLogout}>
      {renderContent()}
    </Layout>
  )
}

export default App

frontend/src/api/client.ts (new file, 41 lines)
@@ -0,0 +1,41 @@
import axios, { AxiosInstance, AxiosError } from 'axios'

const apiClient: AxiosInstance = axios.create({
  baseURL: import.meta.env.VITE_API_URL || 'http://localhost:8000',
  headers: {
    'Content-Type': 'application/json',
  },
  timeout: 30000,
})

apiClient.interceptors.request.use(
  (config) => {
    const token = localStorage.getItem('admin_token')
    if (token) {
      config.headers['X-Admin-Token'] = token
    }
    return config
  },
  (error) => {
    return Promise.reject(error)
  }
)

apiClient.interceptors.response.use(
  (response) => response,
  (error: AxiosError) => {
    if (error.response?.status === 401) {
      console.warn('Authentication required. Please set admin_token in localStorage.')
      // Don't redirect to avoid infinite loop
      // User should manually set: localStorage.setItem('admin_token', 'your-token')
    }

    if (error.response?.status === 429) {
      console.error('Rate limit exceeded')
    }

    return Promise.reject(error)
  }
)

export default apiClient

frontend/src/api/endpoints/annotations.ts (new file, 66 lines)
@@ -0,0 +1,66 @@
import apiClient from '../client'
import type {
  AnnotationItem,
  CreateAnnotationRequest,
  AnnotationOverrideRequest,
} from '../types'

export const annotationsApi = {
  list: async (documentId: string): Promise<AnnotationItem[]> => {
    const { data } = await apiClient.get(
      `/api/v1/admin/documents/${documentId}/annotations`
    )
    return data.annotations
  },

  create: async (
    documentId: string,
    annotation: CreateAnnotationRequest
  ): Promise<AnnotationItem> => {
    const { data } = await apiClient.post(
      `/api/v1/admin/documents/${documentId}/annotations`,
      annotation
    )
    return data
  },

  update: async (
    documentId: string,
    annotationId: string,
    updates: Partial<CreateAnnotationRequest>
  ): Promise<AnnotationItem> => {
    const { data } = await apiClient.patch(
      `/api/v1/admin/documents/${documentId}/annotations/${annotationId}`,
      updates
    )
    return data
  },

  delete: async (documentId: string, annotationId: string): Promise<void> => {
    await apiClient.delete(
      `/api/v1/admin/documents/${documentId}/annotations/${annotationId}`
    )
  },

  verify: async (
    documentId: string,
    annotationId: string
  ): Promise<{ annotation_id: string; is_verified: boolean; message: string }> => {
    const { data } = await apiClient.post(
      `/api/v1/admin/documents/${documentId}/annotations/${annotationId}/verify`
    )
    return data
  },

  override: async (
    documentId: string,
    annotationId: string,
    overrideData: AnnotationOverrideRequest
  ): Promise<{ annotation_id: string; source: string; message: string }> => {
    const { data } = await apiClient.patch(
      `/api/v1/admin/documents/${documentId}/annotations/${annotationId}/override`,
      overrideData
    )
    return data
  },
}

frontend/src/api/endpoints/documents.ts (new file, 80 lines)
@@ -0,0 +1,80 @@
import apiClient from '../client'
import type {
  DocumentListResponse,
  DocumentDetailResponse,
  DocumentItem,
  UploadDocumentResponse,
} from '../types'

export const documentsApi = {
  list: async (params?: {
    status?: string
    limit?: number
    offset?: number
  }): Promise<DocumentListResponse> => {
    const { data } = await apiClient.get('/api/v1/admin/documents', { params })
    return data
  },

  getDetail: async (documentId: string): Promise<DocumentDetailResponse> => {
    const { data } = await apiClient.get(`/api/v1/admin/documents/${documentId}`)
    return data
  },

  upload: async (file: File): Promise<UploadDocumentResponse> => {
    const formData = new FormData()
    formData.append('file', file)

    const { data } = await apiClient.post('/api/v1/admin/documents', formData, {
      headers: {
        'Content-Type': 'multipart/form-data',
      },
    })
    return data
  },

  batchUpload: async (
    files: File[],
    csvFile?: File
  ): Promise<{ batch_id: string; message: string; documents_created: number }> => {
    const formData = new FormData()

    files.forEach((file) => {
      formData.append('files', file)
    })

    if (csvFile) {
      formData.append('csv_file', csvFile)
    }

    const { data } = await apiClient.post('/api/v1/admin/batch/upload', formData, {
      headers: {
        'Content-Type': 'multipart/form-data',
      },
    })
    return data
  },

  delete: async (documentId: string): Promise<void> => {
    await apiClient.delete(`/api/v1/admin/documents/${documentId}`)
  },

  updateStatus: async (
    documentId: string,
    status: string
  ): Promise<DocumentItem> => {
    const { data } = await apiClient.patch(
      `/api/v1/admin/documents/${documentId}/status`,
      null,
      { params: { status } }
    )
    return data
  },

  triggerAutoLabel: async (documentId: string): Promise<{ message: string }> => {
    const { data } = await apiClient.post(
      `/api/v1/admin/documents/${documentId}/auto-label`
    )
    return data
  },
}

frontend/src/api/endpoints/index.ts (new file, 4 lines)
@@ -0,0 +1,4 @@
export { documentsApi } from './documents'
export { annotationsApi } from './annotations'
export { trainingApi } from './training'
export { inferenceApi } from './inference'

frontend/src/api/endpoints/inference.ts (new file, 16 lines)
@@ -0,0 +1,16 @@
import apiClient from '../client'
import type { InferenceResponse } from '../types'

export const inferenceApi = {
  processDocument: async (file: File): Promise<InferenceResponse> => {
    const formData = new FormData()
    formData.append('file', file)

    const { data } = await apiClient.post('/api/v1/infer', formData, {
      headers: {
        'Content-Type': 'multipart/form-data',
      },
    })
    return data
  },
}

frontend/src/api/endpoints/training.ts (new file, 74 lines)
@@ -0,0 +1,74 @@
import apiClient from '../client'
import type { TrainingModelsResponse, DocumentListResponse } from '../types'

export const trainingApi = {
  getDocumentsForTraining: async (params?: {
    has_annotations?: boolean
    min_annotation_count?: number
    exclude_used_in_training?: boolean
    limit?: number
    offset?: number
  }): Promise<DocumentListResponse> => {
    const { data } = await apiClient.get('/api/v1/admin/training/documents', {
      params,
    })
    return data
  },

  getModels: async (params?: {
    status?: string
    limit?: number
    offset?: number
  }): Promise<TrainingModelsResponse> => {
    const { data } = await apiClient.get('/api/v1/admin/training/models', {
      params,
    })
    return data
  },

  getTaskDetail: async (taskId: string) => {
    const { data } = await apiClient.get(`/api/v1/admin/training/tasks/${taskId}`)
    return data
  },

  startTraining: async (config: {
    name: string
    description?: string
    document_ids: string[]
    epochs?: number
    batch_size?: number
    model_base?: string
  }) => {
    // Convert frontend config to backend TrainingTaskCreate format
    const taskRequest = {
      name: config.name,
      task_type: 'yolo',
      description: config.description,
      config: {
        document_ids: config.document_ids,
        epochs: config.epochs,
        batch_size: config.batch_size,
        base_model: config.model_base,
      },
    }
    const { data } = await apiClient.post('/api/v1/admin/training/tasks', taskRequest)
    return data
  },

  cancelTask: async (taskId: string) => {
    const { data } = await apiClient.post(
      `/api/v1/admin/training/tasks/${taskId}/cancel`
    )
    return data
  },

  downloadModel: async (taskId: string): Promise<Blob> => {
    const { data } = await apiClient.get(
      `/api/v1/admin/training/models/${taskId}/download`,
      {
        responseType: 'blob',
      }
    )
    return data
  },
}

frontend/src/api/types.ts (new file, 173 lines)
@@ -0,0 +1,173 @@
export interface DocumentItem {
  document_id: string
  filename: string
  file_size: number
  content_type: string
  page_count: number
  status: 'pending' | 'labeled' | 'verified' | 'exported'
  auto_label_status: 'pending' | 'running' | 'completed' | 'failed' | null
  auto_label_error: string | null
  upload_source: string
  created_at: string
  updated_at: string
  annotation_count?: number
  annotation_sources?: {
    manual: number
    auto: number
    verified: number
  }
}

export interface DocumentListResponse {
  documents: DocumentItem[]
  total: number
  limit: number
  offset: number
}

export interface AnnotationItem {
  annotation_id: string
  page_number: number
  class_id: number
  class_name: string
  bbox: {
    x: number
    y: number
    width: number
    height: number
  }
  normalized_bbox: {
    x_center: number
    y_center: number
    width: number
    height: number
  }
  text_value: string | null
  confidence: number | null
  source: 'manual' | 'auto'
  created_at: string
}

export interface DocumentDetailResponse {
  document_id: string
  filename: string
  file_size: number
  content_type: string
  page_count: number
  status: 'pending' | 'labeled' | 'verified' | 'exported'
  auto_label_status: 'pending' | 'running' | 'completed' | 'failed' | null
  auto_label_error: string | null
  upload_source: string
  batch_id: string | null
  csv_field_values: Record<string, string> | null
  can_annotate: boolean
  annotation_lock_until: string | null
  annotations: AnnotationItem[]
  image_urls: string[]
  training_history: Array<{
    task_id: string
    name: string
    trained_at: string
    model_metrics: {
      mAP: number | null
      precision: number | null
      recall: number | null
    } | null
  }>
  created_at: string
  updated_at: string
}

export interface TrainingTask {
  task_id: string
  admin_token: string
  name: string
  description: string | null
  status: 'pending' | 'running' | 'completed' | 'failed'
  task_type: string
  config: Record<string, unknown>
  started_at: string | null
  completed_at: string | null
  error_message: string | null
  result_metrics: Record<string, unknown>
  model_path: string | null
  document_count: number
  metrics_mAP: number | null
  metrics_precision: number | null
  metrics_recall: number | null
  created_at: string
  updated_at: string
}

export interface TrainingModelsResponse {
  models: TrainingTask[]
  total: number
  limit: number
  offset: number
}

export interface ErrorResponse {
  detail: string
}

export interface UploadDocumentResponse {
  document_id: string
  filename: string
  status: string
  message: string
}

export interface CreateAnnotationRequest {
  page_number: number
  class_id: number
  bbox: {
    x: number
    y: number
    width: number
    height: number
  }
  text_value?: string
}

export interface AnnotationOverrideRequest {
  text_value?: string
  bbox?: {
    x: number
    y: number
    width: number
    height: number
  }
  class_id?: number
  class_name?: string
  reason?: string
}

export interface CrossValidationResult {
  is_valid: boolean
  payment_line_ocr: string | null
  payment_line_amount: string | null
  payment_line_account: string | null
  payment_line_account_type: 'bankgiro' | 'plusgiro' | null
  ocr_match: boolean | null
  amount_match: boolean | null
  bankgiro_match: boolean | null
  plusgiro_match: boolean | null
  details: string[]
}

export interface InferenceResult {
  document_id: string
  document_type: string
  success: boolean
  fields: Record<string, string>
  confidence: Record<string, number>
  cross_validation: CrossValidationResult | null
  processing_time_ms: number
  visualization_url: string | null
  errors: string[]
  fallback_used: boolean
}

export interface InferenceResponse {
  result: InferenceResult
}

frontend/src/components/Badge.tsx (new file, 39 lines)
@@ -0,0 +1,39 @@
import React from 'react';
import { DocumentStatus } from '../types';
import { Check } from 'lucide-react';

interface BadgeProps {
  status: DocumentStatus | 'Exported';
}

export const Badge: React.FC<BadgeProps> = ({ status }) => {
  if (status === 'Exported') {
    return (
      <span className="inline-flex items-center gap-1.5 px-2.5 py-1 rounded-full text-xs font-medium bg-warm-selected text-warm-text-secondary">
        <Check size={12} strokeWidth={3} />
        Exported
      </span>
    );
  }

  const styles = {
    [DocumentStatus.PENDING]: "bg-white border border-warm-divider text-warm-text-secondary",
    [DocumentStatus.LABELED]: "bg-warm-text-secondary text-white border border-transparent",
    [DocumentStatus.VERIFIED]: "bg-warm-state-success/10 text-warm-state-success border border-warm-state-success/20",
    [DocumentStatus.PARTIAL]: "bg-warm-state-warning/10 text-warm-state-warning border border-warm-state-warning/20",
  };

  const icons = {
    [DocumentStatus.VERIFIED]: <Check size={12} className="mr-1" />,
    [DocumentStatus.PARTIAL]: <span className="mr-1 text-[10px] font-bold">!</span>,
    [DocumentStatus.PENDING]: null,
    [DocumentStatus.LABELED]: null,
  }

  return (
    <span className={`inline-flex items-center px-3 py-1 rounded-full text-xs font-medium border ${styles[status]}`}>
      {icons[status]}
      {status}
    </span>
  );
};

frontend/src/components/Button.tsx (new file, 38 lines)
@@ -0,0 +1,38 @@
import React from 'react';

interface ButtonProps extends React.ButtonHTMLAttributes<HTMLButtonElement> {
  variant?: 'primary' | 'secondary' | 'outline' | 'text';
  size?: 'sm' | 'md' | 'lg';
}

export const Button: React.FC<ButtonProps> = ({
  variant = 'primary',
  size = 'md',
  className = '',
  children,
  ...props
}) => {
  const baseStyles = "inline-flex items-center justify-center rounded-md font-medium transition-all duration-150 ease-out active:scale-98 disabled:opacity-50 disabled:pointer-events-none";

  const variants = {
    primary: "bg-warm-text-secondary text-white hover:bg-warm-text-primary shadow-sm",
    secondary: "bg-white border border-warm-divider text-warm-text-secondary hover:bg-warm-hover",
    outline: "bg-transparent border border-warm-text-secondary text-warm-text-secondary hover:bg-warm-hover",
    text: "text-warm-text-muted hover:text-warm-text-primary hover:bg-warm-hover",
  };

  const sizes = {
    sm: "h-8 px-3 text-xs",
    md: "h-10 px-4 text-sm",
    lg: "h-12 px-6 text-base",
  };

  return (
    <button
      className={`${baseStyles} ${variants[variant]} ${sizes[size]} ${className}`}
      {...props}
    >
      {children}
    </button>
  );
};

frontend/src/components/Dashboard.tsx (new file, 266 lines)
@@ -0,0 +1,266 @@
|
|||||||
|
import React, { useState } from 'react'
|
||||||
|
import { Search, ChevronDown, MoreHorizontal, FileText } from 'lucide-react'
|
||||||
|
import { Badge } from './Badge'
|
||||||
|
import { Button } from './Button'
|
||||||
|
import { UploadModal } from './UploadModal'
|
||||||
|
import { useDocuments } from '../hooks/useDocuments'
|
||||||
|
import type { DocumentItem } from '../api/types'
|
||||||
|
|
||||||
|
interface DashboardProps {
|
||||||
|
onNavigate: (view: string, docId?: string) => void
|
||||||
|
}
|
||||||
|
|
||||||
|
const getStatusForBadge = (status: string): string => {
|
||||||
|
const statusMap: Record<string, string> = {
|
||||||
|
pending: 'Pending',
|
||||||
|
labeled: 'Labeled',
|
||||||
|
verified: 'Verified',
|
||||||
|
exported: 'Exported',
|
||||||
|
}
|
||||||
|
return statusMap[status] || status
|
||||||
|
}
|
||||||
|
|
||||||
|
const getAutoLabelProgress = (doc: DocumentItem): number | undefined => {
|
||||||
|
if (doc.auto_label_status === 'running') {
|
||||||
|
return 45
|
||||||
|
}
|
||||||
|
if (doc.auto_label_status === 'completed') {
|
||||||
|
return 100
|
||||||
|
}
|
||||||
|
return undefined
|
||||||
|
}
|
||||||
|
|
||||||
|
export const Dashboard: React.FC<DashboardProps> = ({ onNavigate }) => {
|
||||||
|
const [isUploadOpen, setIsUploadOpen] = useState(false)
|
||||||
|
const [selectedDocs, setSelectedDocs] = useState<Set<string>>(new Set())
|
||||||
|
const [statusFilter, setStatusFilter] = useState<string>('')
|
||||||
|
const [limit] = useState(20)
|
||||||
|
const [offset] = useState(0)
|
||||||
|
|
||||||
|
const { documents, total, isLoading, error, refetch } = useDocuments({
|
||||||
|
status: statusFilter || undefined,
|
||||||
|
limit,
|
||||||
|
offset,
|
||||||
|
})
|
||||||
|
|
||||||
|
const toggleSelection = (id: string) => {
|
||||||
|
const newSet = new Set(selectedDocs)
|
||||||
|
if (newSet.has(id)) {
|
||||||
|
newSet.delete(id)
|
||||||
|
} else {
|
||||||
|
newSet.add(id)
|
||||||
|
}
|
||||||
|
setSelectedDocs(newSet)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (error) {
|
||||||
|
return (
|
||||||
|
<div className="p-8 max-w-7xl mx-auto">
|
||||||
|
<div className="bg-red-50 border border-red-200 text-red-800 p-4 rounded-lg">
|
||||||
|
Error loading documents. Please check your connection to the backend API.
|
||||||
|
<button
|
||||||
|
onClick={() => refetch()}
|
||||||
|
className="ml-4 underline hover:no-underline"
|
||||||
|
>
|
||||||
|
Retry
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="p-8 max-w-7xl mx-auto animate-fade-in">
|
||||||
|
<div className="flex items-center justify-between mb-8">
|
||||||
|
<div>
|
||||||
|
<h1 className="text-3xl font-bold text-warm-text-primary tracking-tight">
|
||||||
|
Documents
|
||||||
|
</h1>
|
||||||
|
<p className="text-sm text-warm-text-muted mt-1">
|
||||||
|
            {isLoading ? 'Loading...' : `${total} documents total`}
          </p>
        </div>
        <div className="flex gap-3">
          <Button variant="secondary" disabled={selectedDocs.size === 0}>
            Export Selection ({selectedDocs.size})
          </Button>
          <Button onClick={() => setIsUploadOpen(true)}>Upload Documents</Button>
        </div>
      </div>

      <div className="bg-warm-card border border-warm-border rounded-lg p-4 mb-6 shadow-sm flex flex-wrap gap-4 items-center">
        <div className="relative flex-1 min-w-[200px]">
          <Search
            className="absolute left-3 top-1/2 -translate-y-1/2 text-warm-text-muted"
            size={16}
          />
          <input
            type="text"
            placeholder="Search documents..."
            className="w-full pl-9 pr-4 h-10 rounded-md border border-warm-border bg-white focus:outline-none focus:ring-1 focus:ring-warm-state-info transition-shadow text-sm"
          />
        </div>

        <div className="flex gap-3">
          <div className="relative">
            <select
              value={statusFilter}
              onChange={(e) => setStatusFilter(e.target.value)}
              className="h-10 pl-3 pr-8 rounded-md border border-warm-border bg-white text-sm text-warm-text-secondary focus:outline-none appearance-none cursor-pointer hover:bg-warm-hover"
            >
              <option value="">All Statuses</option>
              <option value="pending">Pending</option>
              <option value="labeled">Labeled</option>
              <option value="verified">Verified</option>
              <option value="exported">Exported</option>
            </select>
            <ChevronDown
              className="absolute right-2.5 top-1/2 -translate-y-1/2 pointer-events-none text-warm-text-muted"
              size={14}
            />
          </div>
        </div>
      </div>

      <div className="bg-warm-card border border-warm-border rounded-lg shadow-sm overflow-hidden">
        <table className="w-full text-left border-collapse">
          <thead>
            <tr className="border-b border-warm-border bg-white">
              <th className="py-3 pl-6 pr-4 w-12">
                <input
                  type="checkbox"
                  className="rounded border-warm-divider text-warm-text-primary focus:ring-warm-text-secondary"
                />
              </th>
              <th className="py-3 px-4 text-xs font-semibold text-warm-text-muted uppercase tracking-wider">
                Document Name
              </th>
              <th className="py-3 px-4 text-xs font-semibold text-warm-text-muted uppercase tracking-wider">
                Date
              </th>
              <th className="py-3 px-4 text-xs font-semibold text-warm-text-muted uppercase tracking-wider">
                Status
              </th>
              <th className="py-3 px-4 text-xs font-semibold text-warm-text-muted uppercase tracking-wider">
                Annotations
              </th>
              <th className="py-3 px-4 text-xs font-semibold text-warm-text-muted uppercase tracking-wider w-64">
                Auto-label
              </th>
              <th className="py-3 px-4 w-12"></th>
            </tr>
          </thead>
          <tbody>
            {isLoading ? (
              <tr>
                <td colSpan={7} className="py-8 text-center text-warm-text-muted">
                  Loading documents...
                </td>
              </tr>
            ) : documents.length === 0 ? (
              <tr>
                <td colSpan={7} className="py-8 text-center text-warm-text-muted">
                  No documents found. Upload your first document to get started.
                </td>
              </tr>
            ) : (
              documents.map((doc) => {
                const isSelected = selectedDocs.has(doc.document_id)
                const progress = getAutoLabelProgress(doc)

                return (
                  <tr
                    key={doc.document_id}
                    onClick={() => onNavigate('detail', doc.document_id)}
                    className={`
                      group transition-colors duration-150 cursor-pointer border-b border-warm-border last:border-0
                      ${isSelected ? 'bg-warm-selected' : 'hover:bg-warm-hover bg-white'}
                    `}
                  >
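                    {/* Checkbox cell: stopPropagation keeps selection toggles
                        from triggering the row's navigate-to-detail click. */}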
                    <td
                      className="py-4 pl-6 pr-4 relative"
                      onClick={(e) => {
                        e.stopPropagation()
                        toggleSelection(doc.document_id)
                      }}
                    >
                      {isSelected && (
                        <div className="absolute left-0 top-0 bottom-0 w-[3px] bg-warm-state-info" />
                      )}
                      <input
                        type="checkbox"
                        checked={isSelected}
                        readOnly
                        className="rounded border-warm-divider text-warm-text-primary focus:ring-warm-text-secondary cursor-pointer"
                      />
                    </td>
                    <td className="py-4 px-4">
                      <div className="flex items-center gap-3">
                        <div className="p-2 bg-warm-bg rounded border border-warm-border text-warm-text-muted">
                          <FileText size={16} />
                        </div>
                        <span className="font-medium text-warm-text-secondary">
                          {doc.filename}
                        </span>
                      </div>
                    </td>
                    <td className="py-4 px-4 text-sm text-warm-text-secondary font-mono">
                      {new Date(doc.created_at).toLocaleDateString()}
                    </td>
                    <td className="py-4 px-4">
                      <Badge status={getStatusForBadge(doc.status)} />
                    </td>
                    <td className="py-4 px-4 text-sm text-warm-text-secondary">
                      {doc.annotation_count || 0} annotations
                    </td>
                    <td className="py-4 px-4">
                      {doc.auto_label_status === 'running' && progress && (
                        <div className="w-full">
                          <div className="flex justify-between text-xs mb-1">
                            <span className="text-warm-text-secondary font-medium">
                              Running
                            </span>
                            <span className="text-warm-text-muted">{progress}%</span>
                          </div>
                          <div className="h-1.5 w-full bg-warm-selected rounded-full overflow-hidden">
                            <div
                              className="h-full bg-warm-state-info transition-all duration-500 ease-out"
                              style={{ width: `${progress}%` }}
                            />
                          </div>
                        </div>
                      )}
                      {doc.auto_label_status === 'completed' && (
                        <span className="text-sm font-medium text-warm-state-success">
                          Completed
                        </span>
                      )}
                      {doc.auto_label_status === 'failed' && (
                        <span className="text-sm font-medium text-warm-state-error">
                          Failed
                        </span>
                      )}
                    </td>
                    <td className="py-4 px-4 text-right">
                      <button className="text-warm-text-muted hover:text-warm-text-secondary p-1 rounded hover:bg-black/5 transition-colors">
                        <MoreHorizontal size={18} />
                      </button>
                    </td>
                  </tr>
                )
              })
            )}
          </tbody>
        </table>
      </div>

      <UploadModal
        isOpen={isUploadOpen}
        onClose={() => {
          setIsUploadOpen(false)
          refetch()
        }}
      />
    </div>
  )
}
frontend/src/components/DashboardOverview.tsx (148 lines, Normal file)
@@ -0,0 +1,148 @@
import React from 'react'
import { FileText, CheckCircle, Clock, TrendingUp, Activity } from 'lucide-react'
import { Button } from './Button'
import { useDocuments } from '../hooks/useDocuments'
import { useTraining } from '../hooks/useTraining'

interface DashboardOverviewProps {
  onNavigate: (view: string) => void
}

export const DashboardOverview: React.FC<DashboardOverviewProps> = ({ onNavigate }) => {
  const { total: totalDocs, isLoading: docsLoading } = useDocuments({ limit: 1 })
  const { models, isLoadingModels } = useTraining()

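  // NOTE: 'Labeled' and 'Pending' below are hardcoded to 0, and the System
  // Status rows near the bottom are static; both look like WIP placeholders.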
  const stats = [
    {
      label: 'Total Documents',
      value: docsLoading ? '...' : totalDocs.toString(),
      icon: FileText,
      color: 'text-warm-text-primary',
      bgColor: 'bg-warm-bg',
    },
    {
      label: 'Labeled',
      value: '0',
      icon: CheckCircle,
      color: 'text-warm-state-success',
      bgColor: 'bg-green-50',
    },
    {
      label: 'Pending',
      value: '0',
      icon: Clock,
      color: 'text-warm-state-warning',
      bgColor: 'bg-yellow-50',
    },
    {
      label: 'Training Models',
      value: isLoadingModels ? '...' : models.length.toString(),
      icon: TrendingUp,
      color: 'text-warm-state-info',
      bgColor: 'bg-blue-50',
    },
  ]

  return (
    <div className="p-8 max-w-7xl mx-auto animate-fade-in">
      {/* Header */}
      <div className="mb-8">
        <h1 className="text-3xl font-bold text-warm-text-primary tracking-tight">
          Dashboard
        </h1>
        <p className="text-sm text-warm-text-muted mt-1">
          Overview of your document annotation system
        </p>
      </div>

      {/* Stats Grid */}
      <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-4 gap-6 mb-8">
        {stats.map((stat) => (
          <div
            key={stat.label}
            className="bg-warm-card border border-warm-border rounded-lg p-6 shadow-sm hover:shadow-md transition-shadow"
          >
            <div className="flex items-center justify-between mb-4">
              <div className={`p-3 rounded-lg ${stat.bgColor}`}>
                <stat.icon className={stat.color} size={24} />
              </div>
            </div>
            <p className="text-2xl font-bold text-warm-text-primary mb-1">
              {stat.value}
            </p>
            <p className="text-sm text-warm-text-muted">{stat.label}</p>
          </div>
        ))}
      </div>

      {/* Quick Actions */}
      <div className="bg-warm-card border border-warm-border rounded-lg p-6 shadow-sm mb-8">
        <h2 className="text-lg font-semibold text-warm-text-primary mb-4">
          Quick Actions
        </h2>
        <div className="grid grid-cols-1 md:grid-cols-3 gap-4">
          <Button onClick={() => onNavigate('documents')} className="justify-start">
            <FileText size={18} className="mr-2" />
            Manage Documents
          </Button>
          <Button onClick={() => onNavigate('training')} variant="secondary" className="justify-start">
            <Activity size={18} className="mr-2" />
            Start Training
          </Button>
          <Button onClick={() => onNavigate('models')} variant="secondary" className="justify-start">
            <TrendingUp size={18} className="mr-2" />
            View Models
          </Button>
        </div>
      </div>

      {/* Recent Activity */}
      <div className="bg-warm-card border border-warm-border rounded-lg shadow-sm overflow-hidden">
        <div className="p-6 border-b border-warm-border">
          <h2 className="text-lg font-semibold text-warm-text-primary">
            Recent Activity
          </h2>
        </div>
        <div className="p-6">
          <div className="text-center py-8 text-warm-text-muted">
            <Activity size={48} className="mx-auto mb-3 opacity-20" />
            <p className="text-sm">No recent activity</p>
            <p className="text-xs mt-1">
              Start by uploading documents or creating training jobs
            </p>
          </div>
        </div>
      </div>

      {/* System Status */}
      <div className="mt-8 bg-warm-card border border-warm-border rounded-lg p-6 shadow-sm">
        <h2 className="text-lg font-semibold text-warm-text-primary mb-4">
          System Status
        </h2>
        <div className="space-y-3">
          <div className="flex items-center justify-between">
            <span className="text-sm text-warm-text-secondary">Backend API</span>
            <span className="flex items-center text-sm text-warm-state-success">
              <span className="w-2 h-2 bg-green-500 rounded-full mr-2"></span>
              Online
            </span>
          </div>
          <div className="flex items-center justify-between">
            <span className="text-sm text-warm-text-secondary">Database</span>
            <span className="flex items-center text-sm text-warm-state-success">
              <span className="w-2 h-2 bg-green-500 rounded-full mr-2"></span>
              Connected
            </span>
          </div>
          <div className="flex items-center justify-between">
            <span className="text-sm text-warm-text-secondary">GPU</span>
            <span className="flex items-center text-sm text-warm-state-success">
              <span className="w-2 h-2 bg-green-500 rounded-full mr-2"></span>
              Available
            </span>
          </div>
        </div>
      </div>
    </div>
  )
}
frontend/src/components/DocumentDetail.tsx (504 lines, Normal file)
@@ -0,0 +1,504 @@
import React, { useState, useRef, useEffect } from 'react'
import { ChevronLeft, ZoomIn, ZoomOut, Plus, Edit2, Trash2, Tag, CheckCircle } from 'lucide-react'
import { Button } from './Button'
import { useDocumentDetail } from '../hooks/useDocumentDetail'
import { useAnnotations } from '../hooks/useAnnotations'
import { documentsApi } from '../api/endpoints/documents'
import type { AnnotationItem } from '../api/types'

interface DocumentDetailProps {
  docId: string
  onBack: () => void
}

// Field class mapping from backend
const FIELD_CLASSES: Record<number, string> = {
  0: 'invoice_number',
  1: 'invoice_date',
  2: 'invoice_due_date',
  3: 'ocr_number',
  4: 'bankgiro',
  5: 'plusgiro',
  6: 'amount',
  7: 'supplier_organisation_number',
  8: 'payment_line',
  9: 'customer_number',
}

export const DocumentDetail: React.FC<DocumentDetailProps> = ({ docId, onBack }) => {
  const { document, annotations, isLoading } = useDocumentDetail(docId)
  const {
    createAnnotation,
    updateAnnotation,
    deleteAnnotation,
    isCreating,
    isDeleting,
  } = useAnnotations(docId)

  const [selectedId, setSelectedId] = useState<string | null>(null)
  const [zoom, setZoom] = useState(100)
  const [isDrawing, setIsDrawing] = useState(false)
  const [drawStart, setDrawStart] = useState<{ x: number; y: number } | null>(null)
  const [drawEnd, setDrawEnd] = useState<{ x: number; y: number } | null>(null)
  const [selectedClassId, setSelectedClassId] = useState<number>(0)
  const [currentPage, setCurrentPage] = useState(1)
  const [imageSize, setImageSize] = useState<{ width: number; height: number } | null>(null)
  const [imageBlobUrl, setImageBlobUrl] = useState<string | null>(null)

  const canvasRef = useRef<HTMLDivElement>(null)
  const imageRef = useRef<HTMLImageElement>(null)

  const [isMarkingComplete, setIsMarkingComplete] = useState(false)

  const selectedAnnotation = annotations?.find((a) => a.annotation_id === selectedId)

  // Handle mark as complete
  const handleMarkComplete = async () => {
    if (!annotations || annotations.length === 0) {
      alert('Please add at least one annotation before marking as complete.')
      return
    }

    if (!confirm('Mark this document as labeled? This will save annotations to the database.')) {
      return
    }

    setIsMarkingComplete(true)
    try {
      const result = await documentsApi.updateStatus(docId, 'labeled')
      alert(`Document marked as labeled. ${(result as any).fields_saved || annotations.length} annotations saved.`)
      onBack() // Return to document list
    } catch (error) {
      console.error('Failed to mark document as complete:', error)
      alert('Failed to mark document as complete. Please try again.')
    } finally {
      setIsMarkingComplete(false)
    }
  }

  // Load image via fetch with authentication header
  useEffect(() => {
    let objectUrl: string | null = null

    const loadImage = async () => {
      if (!docId) return

      const token = localStorage.getItem('admin_token')
      const imageUrl = `${import.meta.env.VITE_API_URL || 'http://localhost:8000'}/api/v1/admin/documents/${docId}/images/${currentPage}`

      try {
        const response = await fetch(imageUrl, {
          headers: {
            'X-Admin-Token': token || '',
          },
        })

        if (!response.ok) {
          throw new Error(`Failed to load image: ${response.status}`)
        }

        const blob = await response.blob()
        objectUrl = URL.createObjectURL(blob)
        setImageBlobUrl(objectUrl)
      } catch (error) {
        console.error('Failed to load image:', error)
      }
    }

    loadImage()

    // Cleanup: revoke object URL when component unmounts or page changes
    return () => {
      if (objectUrl) {
        URL.revokeObjectURL(objectUrl)
      }
    }
  }, [currentPage, docId])

  // Load image size
  useEffect(() => {
    if (imageRef.current && imageRef.current.complete) {
      setImageSize({
        width: imageRef.current.naturalWidth,
        height: imageRef.current.naturalHeight,
      })
    }
  }, [imageBlobUrl])

  const handleImageLoad = () => {
    if (imageRef.current) {
      setImageSize({
        width: imageRef.current.naturalWidth,
        height: imageRef.current.naturalHeight,
      })
    }
  }

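  // Mouse coordinates are divided by the zoom factor so that stored bboxes
  // are in unscaled image pixels, independent of the current zoom level.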
  const handleMouseDown = (e: React.MouseEvent<HTMLDivElement>) => {
    if (!canvasRef.current || !imageSize) return
    const rect = canvasRef.current.getBoundingClientRect()
    const x = (e.clientX - rect.left) / (zoom / 100)
    const y = (e.clientY - rect.top) / (zoom / 100)
    setIsDrawing(true)
    setDrawStart({ x, y })
    setDrawEnd({ x, y })
  }

  const handleMouseMove = (e: React.MouseEvent<HTMLDivElement>) => {
    if (!isDrawing || !canvasRef.current || !imageSize) return
    const rect = canvasRef.current.getBoundingClientRect()
    const x = (e.clientX - rect.left) / (zoom / 100)
    const y = (e.clientY - rect.top) / (zoom / 100)
    setDrawEnd({ x, y })
  }

  const handleMouseUp = () => {
    if (!isDrawing || !drawStart || !drawEnd || !imageSize) {
      setIsDrawing(false)
      return
    }

    const bbox_x = Math.min(drawStart.x, drawEnd.x)
    const bbox_y = Math.min(drawStart.y, drawEnd.y)
    const bbox_width = Math.abs(drawEnd.x - drawStart.x)
    const bbox_height = Math.abs(drawEnd.y - drawStart.y)

    // Only create if box is large enough (min 10x10 pixels)
    if (bbox_width > 10 && bbox_height > 10) {
      createAnnotation({
        page_number: currentPage,
        class_id: selectedClassId,
        bbox: {
          x: Math.round(bbox_x),
          y: Math.round(bbox_y),
          width: Math.round(bbox_width),
          height: Math.round(bbox_height),
        },
      })
    }

    setIsDrawing(false)
    setDrawStart(null)
    setDrawEnd(null)
  }

  const handleDeleteAnnotation = (annotationId: string) => {
    if (confirm('Are you sure you want to delete this annotation?')) {
      deleteAnnotation(annotationId)
      setSelectedId(null)
    }
  }

  if (isLoading || !document) {
    return (
      <div className="flex h-screen items-center justify-center">
        <div className="text-warm-text-muted">Loading...</div>
      </div>
    )
  }

  // Get current page annotations
  const pageAnnotations = annotations?.filter((a) => a.page_number === currentPage) || []

  return (
    <div className="flex h-[calc(100vh-56px)] overflow-hidden">
      {/* Main Canvas Area */}
      <div className="flex-1 bg-warm-bg flex flex-col relative">
        {/* Toolbar */}
        <div className="h-14 border-b border-warm-border bg-white flex items-center justify-between px-4 z-10">
          <div className="flex items-center gap-4">
            <button
              onClick={onBack}
              className="p-2 hover:bg-warm-hover rounded-md text-warm-text-secondary transition-colors"
            >
              <ChevronLeft size={20} />
            </button>
            <div>
              <h2 className="text-sm font-semibold text-warm-text-primary">{document.filename}</h2>
              <p className="text-xs text-warm-text-muted">
                Page {currentPage} of {document.page_count}
              </p>
            </div>
            <div className="h-6 w-px bg-warm-divider mx-2" />
            <div className="flex items-center gap-2">
              <button
                className="p-1.5 hover:bg-warm-hover rounded text-warm-text-secondary"
                onClick={() => setZoom((z) => Math.max(50, z - 10))}
              >
                <ZoomOut size={16} />
              </button>
              <span className="text-xs font-mono w-12 text-center text-warm-text-secondary">
                {zoom}%
              </span>
              <button
                className="p-1.5 hover:bg-warm-hover rounded text-warm-text-secondary"
                onClick={() => setZoom((z) => Math.min(200, z + 10))}
              >
                <ZoomIn size={16} />
              </button>
            </div>
          </div>
          <div className="flex gap-2">
            <Button variant="secondary" size="sm">
              Auto-label
            </Button>
            <Button
              variant="primary"
              size="sm"
              onClick={handleMarkComplete}
              disabled={isMarkingComplete || document.status === 'labeled'}
            >
              <CheckCircle size={16} className="mr-1" />
              {isMarkingComplete ? 'Saving...' : document.status === 'labeled' ? 'Labeled' : 'Mark Complete'}
            </Button>
            {document.page_count > 1 && (
              <div className="flex gap-1">
                <Button
                  variant="secondary"
                  size="sm"
                  onClick={() => setCurrentPage((p) => Math.max(1, p - 1))}
                  disabled={currentPage === 1}
                >
                  Prev
                </Button>
                <Button
                  variant="secondary"
                  size="sm"
                  onClick={() => setCurrentPage((p) => Math.min(document.page_count, p + 1))}
                  disabled={currentPage === document.page_count}
                >
                  Next
                </Button>
              </div>
            )}
          </div>
        </div>

        {/* Canvas Scroll Area */}
        <div className="flex-1 overflow-auto p-8 flex justify-center bg-warm-bg">
          <div
            ref={canvasRef}
            className="bg-white shadow-lg relative transition-transform duration-200 ease-out origin-top"
            style={{
              width: imageSize?.width || 800,
              height: imageSize?.height || 1132,
              transform: `scale(${zoom / 100})`,
              marginBottom: '100px',
              cursor: isDrawing ? 'crosshair' : 'default',
            }}
            onMouseDown={handleMouseDown}
            onMouseMove={handleMouseMove}
            onMouseUp={handleMouseUp}
            onClick={() => setSelectedId(null)}
          >
            {/* Document Image */}
            {imageBlobUrl ? (
              <img
                ref={imageRef}
                src={imageBlobUrl}
                alt={`Page ${currentPage}`}
                className="w-full h-full object-contain select-none pointer-events-none"
                onLoad={handleImageLoad}
              />
            ) : (
              <div className="flex items-center justify-center h-full">
                <div className="text-warm-text-muted">Loading image...</div>
              </div>
            )}

            {/* Annotation Overlays */}
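            {/* Overlay boxes use raw bbox pixel coords: the canvas div matches
                the image's natural size and is scaled as a whole via CSS
                transform, so no per-annotation scaling is needed. */}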
            {pageAnnotations.map((ann) => {
              const isSelected = selectedId === ann.annotation_id
              return (
                <div
                  key={ann.annotation_id}
                  onClick={(e) => {
                    e.stopPropagation()
                    setSelectedId(ann.annotation_id)
                  }}
                  className={`
                    absolute group cursor-pointer transition-all duration-100
                    ${
                      ann.source === 'auto'
                        ? 'border border-dashed border-warm-text-muted bg-transparent'
                        : 'border-2 border-warm-text-secondary bg-warm-text-secondary/5'
                    }
                    ${
                      isSelected
                        ? 'border-2 border-warm-state-info ring-4 ring-warm-state-info/10 z-20'
                        : 'hover:bg-warm-state-info/5 z-10'
                    }
                  `}
                  style={{
                    left: ann.bbox.x,
                    top: ann.bbox.y,
                    width: ann.bbox.width,
                    height: ann.bbox.height,
                  }}
                >
                  {/* Label Tag */}
                  <div
                    className={`
                      absolute -top-6 left-0 text-[10px] uppercase font-bold px-1.5 py-0.5 rounded-sm tracking-wide shadow-sm whitespace-nowrap
                      ${
                        isSelected
                          ? 'bg-warm-state-info text-white'
                          : 'bg-white text-warm-text-secondary border border-warm-border'
                      }
                    `}
                  >
                    {ann.class_name}
                  </div>

                  {/* Resize Handles (Visual only) */}
                  {isSelected && (
                    <>
                      <div className="absolute -top-1 -left-1 w-2 h-2 bg-white border border-warm-state-info rounded-full" />
                      <div className="absolute -top-1 -right-1 w-2 h-2 bg-white border border-warm-state-info rounded-full" />
                      <div className="absolute -bottom-1 -left-1 w-2 h-2 bg-white border border-warm-state-info rounded-full" />
                      <div className="absolute -bottom-1 -right-1 w-2 h-2 bg-white border border-warm-state-info rounded-full" />
                    </>
                  )}
                </div>
              )
            })}

            {/* Drawing Box Preview */}
            {isDrawing && drawStart && drawEnd && (
              <div
                className="absolute border-2 border-warm-state-info bg-warm-state-info/10 z-30 pointer-events-none"
                style={{
                  left: Math.min(drawStart.x, drawEnd.x),
                  top: Math.min(drawStart.y, drawEnd.y),
                  width: Math.abs(drawEnd.x - drawStart.x),
                  height: Math.abs(drawEnd.y - drawStart.y),
                }}
              />
            )}
          </div>
        </div>
      </div>

      {/* Right Sidebar */}
      <div className="w-80 bg-white border-l border-warm-border flex flex-col shadow-[-4px_0_15px_-3px_rgba(0,0,0,0.03)] z-20">
        {/* Field Selector */}
        <div className="p-4 border-b border-warm-border">
          <h3 className="text-sm font-semibold text-warm-text-primary mb-3">Draw Annotation</h3>
          <div className="space-y-2">
            <label className="block text-xs text-warm-text-muted mb-1">Select Field Type</label>
            <select
              value={selectedClassId}
              onChange={(e) => setSelectedClassId(Number(e.target.value))}
              className="w-full px-3 py-2 border border-warm-border rounded-md text-sm focus:outline-none focus:ring-1 focus:ring-warm-state-info"
            >
              {Object.entries(FIELD_CLASSES).map(([id, name]) => (
                <option key={id} value={id}>
                  {name.replace(/_/g, ' ')}
                </option>
              ))}
            </select>
            <p className="text-xs text-warm-text-muted mt-2">
              Click and drag on the document to create a bounding box
            </p>
          </div>
        </div>

        {/* Document Info Card */}
        <div className="p-4 border-b border-warm-border">
          <div className="bg-white rounded-lg border border-warm-border p-4 shadow-sm">
            <h3 className="text-sm font-semibold text-warm-text-primary mb-3">Document Info</h3>
            <div className="space-y-2">
              <div className="flex justify-between text-xs">
                <span className="text-warm-text-muted">Status</span>
                <span className="text-warm-text-secondary font-medium capitalize">
                  {document.status}
                </span>
              </div>
              <div className="flex justify-between text-xs">
                <span className="text-warm-text-muted">Size</span>
                <span className="text-warm-text-secondary font-medium">
                  {(document.file_size / 1024 / 1024).toFixed(2)} MB
                </span>
              </div>
              <div className="flex justify-between text-xs">
                <span className="text-warm-text-muted">Uploaded</span>
                <span className="text-warm-text-secondary font-medium">
                  {new Date(document.created_at).toLocaleDateString()}
                </span>
              </div>
            </div>
          </div>
        </div>

        {/* Annotations List */}
        <div className="flex-1 overflow-y-auto p-4">
          <div className="flex items-center justify-between mb-4">
            <h3 className="text-sm font-semibold text-warm-text-primary">Annotations</h3>
            <span className="text-xs text-warm-text-muted">{pageAnnotations.length} items</span>
          </div>

          {pageAnnotations.length === 0 ? (
            <div className="text-center py-8 text-warm-text-muted">
              <Tag size={48} className="mx-auto mb-3 opacity-20" />
              <p className="text-sm">No annotations yet</p>
              <p className="text-xs mt-1">Draw on the document to add annotations</p>
            </div>
          ) : (
            <div className="space-y-3">
              {pageAnnotations.map((ann) => (
                <div
                  key={ann.annotation_id}
                  onClick={() => setSelectedId(ann.annotation_id)}
                  className={`
                    group p-3 rounded-md border transition-all duration-150 cursor-pointer
                    ${
                      selectedId === ann.annotation_id
                        ? 'bg-warm-bg border-warm-state-info shadow-sm'
                        : 'bg-white border-warm-border hover:border-warm-text-muted'
                    }
                  `}
                >
                  <div className="flex justify-between items-start mb-1">
                    <span className="text-xs font-bold text-warm-text-secondary uppercase tracking-wider">
                      {ann.class_name.replace(/_/g, ' ')}
                    </span>
                    {selectedId === ann.annotation_id && (
                      <div className="flex gap-1">
                        <button
                          onClick={() => handleDeleteAnnotation(ann.annotation_id)}
                          className="text-warm-text-muted hover:text-warm-state-error"
                          disabled={isDeleting}
                        >
                          <Trash2 size={12} />
                        </button>
                      </div>
                    )}
                  </div>
                  <p className="text-sm text-warm-text-muted font-mono truncate">
                    {ann.text_value || '(no text)'}
                  </p>
                  <div className="flex items-center gap-2 mt-2">
                    <span
                      className={`text-[10px] px-1.5 py-0.5 rounded ${
                        ann.source === 'auto'
                          ? 'bg-blue-50 text-blue-700'
                          : 'bg-green-50 text-green-700'
                      }`}
                    >
                      {ann.source}
                    </span>
                    {ann.confidence && (
                      <span className="text-[10px] text-warm-text-muted">
                        {(ann.confidence * 100).toFixed(0)}%
                      </span>
                    )}
                  </div>
                </div>
              ))}
            </div>
          )}
        </div>
      </div>
    </div>
  )
}
frontend/src/components/InferenceDemo.tsx (466 lines, Normal file)
@@ -0,0 +1,466 @@
import React, { useState, useRef } from 'react'
import { UploadCloud, FileText, Loader2, CheckCircle2, AlertCircle, Clock } from 'lucide-react'
import { Button } from './Button'
import { inferenceApi } from '../api/endpoints'
import type { InferenceResult } from '../api/types'

export const InferenceDemo: React.FC = () => {
  const [isDragging, setIsDragging] = useState(false)
  const [selectedFile, setSelectedFile] = useState<File | null>(null)
  const [isProcessing, setIsProcessing] = useState(false)
  const [result, setResult] = useState<InferenceResult | null>(null)
  const [error, setError] = useState<string | null>(null)
  const fileInputRef = useRef<HTMLInputElement>(null)

  const handleFileSelect = (file: File | null) => {
    if (!file) return

    const validTypes = ['application/pdf', 'image/png', 'image/jpeg', 'image/jpg']
    if (!validTypes.includes(file.type)) {
      setError('Please upload a PDF, PNG, or JPG file')
      return
    }

    if (file.size > 50 * 1024 * 1024) {
      setError('File size must be less than 50MB')
      return
    }

    setSelectedFile(file)
    setResult(null)
    setError(null)
  }

  const handleDrop = (e: React.DragEvent) => {
    e.preventDefault()
    setIsDragging(false)
    if (e.dataTransfer.files.length > 0) {
      handleFileSelect(e.dataTransfer.files[0])
    }
  }

  const handleBrowseClick = () => {
    fileInputRef.current?.click()
  }

  const handleProcess = async () => {
    if (!selectedFile) return

    setIsProcessing(true)
    setError(null)

    try {
      const response = await inferenceApi.processDocument(selectedFile)
      console.log('API Response:', response)
      console.log('Visualization URL:', response.result?.visualization_url)
      setResult(response.result)
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Processing failed')
    } finally {
      setIsProcessing(false)
    }
  }

  const handleReset = () => {
    setSelectedFile(null)
    setResult(null)
    setError(null)
  }

  const formatFieldName = (field: string): string => {
    const fieldNames: Record<string, string> = {
      InvoiceNumber: 'Invoice Number',
      InvoiceDate: 'Invoice Date',
      InvoiceDueDate: 'Due Date',
      OCR: 'OCR Number',
      Amount: 'Amount',
      Bankgiro: 'Bankgiro',
      Plusgiro: 'Plusgiro',
      supplier_org_number: 'Supplier Org Number',
      customer_number: 'Customer Number',
      payment_line: 'Payment Line',
    }
    return fieldNames[field] || field
  }

  return (
    <div className="max-w-7xl mx-auto px-4 py-6 space-y-6">
      {/* Header */}
      <div className="text-center">
        <h2 className="text-3xl font-bold text-warm-text-primary mb-2">
          Invoice Extraction Demo
        </h2>
        <p className="text-warm-text-muted">
          Upload a Swedish invoice to see our AI-powered field extraction in action
        </p>
      </div>

      {/* Upload Area */}
      {!result && (
        <div className="max-w-2xl mx-auto">
          <div className="bg-warm-card rounded-xl border border-warm-border p-8 shadow-sm">
            <div
              className={`
                relative h-72 rounded-xl border-2 border-dashed transition-all duration-200
                ${isDragging
                  ? 'border-warm-text-secondary bg-warm-selected scale-[1.02]'
                  : 'border-warm-divider bg-warm-bg hover:bg-warm-hover hover:border-warm-text-secondary/50'
                }
                ${isProcessing ? 'opacity-60 pointer-events-none' : 'cursor-pointer'}
              `}
              onDragOver={(e) => {
                e.preventDefault()
                setIsDragging(true)
              }}
              onDragLeave={() => setIsDragging(false)}
              onDrop={handleDrop}
              onClick={handleBrowseClick}
            >
              <div className="absolute inset-0 flex flex-col items-center justify-center gap-6">
                {isProcessing ? (
                  <>
                    <Loader2 size={56} className="text-warm-text-secondary animate-spin" />
                    <div className="text-center">
                      <p className="text-lg font-semibold text-warm-text-primary mb-1">
                        Processing invoice...
                      </p>
                      <p className="text-sm text-warm-text-muted">
                        This may take a few moments
                      </p>
                    </div>
                  </>
                ) : selectedFile ? (
                  <>
                    <div className="p-5 bg-warm-text-secondary/10 rounded-full">
                      <FileText size={40} className="text-warm-text-secondary" />
                    </div>
                    <div className="text-center px-4">
                      <p className="text-lg font-semibold text-warm-text-primary mb-1">
                        {selectedFile.name}
                      </p>
                      <p className="text-sm text-warm-text-muted">
                        {(selectedFile.size / 1024 / 1024).toFixed(2)} MB
                      </p>
                    </div>
                  </>
                ) : (
                  <>
                    <div className="p-5 bg-warm-text-secondary/10 rounded-full">
                      <UploadCloud size={40} className="text-warm-text-secondary" />
                    </div>
                    <div className="text-center px-4">
                      <p className="text-lg font-semibold text-warm-text-primary mb-2">
                        Drag & drop invoice here
                      </p>
                      <p className="text-sm text-warm-text-muted mb-3">
                        or{' '}
                        <span className="text-warm-text-secondary font-medium">
                          browse files
                        </span>
                      </p>
                      <p className="text-xs text-warm-text-muted">
                        Supports PDF, PNG, JPG (up to 50MB)
                      </p>
                    </div>
                  </>
                )}
              </div>
            </div>

            <input
              ref={fileInputRef}
              type="file"
              accept=".pdf,image/*"
              className="hidden"
              onChange={(e) => handleFileSelect(e.target.files?.[0] || null)}
            />

            {error && (
              <div className="mt-5 p-4 bg-red-50 border border-red-200 rounded-lg flex items-start gap-3">
                <AlertCircle size={18} className="text-red-600 flex-shrink-0 mt-0.5" />
                <span className="text-sm text-red-800 font-medium">{error}</span>
              </div>
            )}

            {selectedFile && !isProcessing && (
              <div className="mt-6 flex gap-3 justify-end">
                <Button variant="secondary" onClick={handleReset}>
                  Cancel
                </Button>
                <Button onClick={handleProcess}>Process Invoice</Button>
              </div>
            )}
          </div>
        </div>
      )}

      {/* Results */}
      {result && (
        <div className="space-y-6">
          {/* Status Header */}
          <div className="bg-warm-card rounded-xl border border-warm-border shadow-sm overflow-hidden">
            <div className="p-6 flex items-center justify-between border-b border-warm-divider">
              <div className="flex items-center gap-4">
                {result.success ? (
                  <div className="p-3 bg-green-100 rounded-xl">
                    <CheckCircle2 size={28} className="text-green-600" />
                  </div>
                ) : (
                  <div className="p-3 bg-yellow-100 rounded-xl">
                    <AlertCircle size={28} className="text-yellow-600" />
                  </div>
                )}
                <div>
                  <h3 className="text-xl font-bold text-warm-text-primary">
                    {result.success ? 'Extraction Complete' : 'Partial Results'}
                  </h3>
                  <p className="text-sm text-warm-text-muted mt-0.5">
                    Document ID: <span className="font-mono">{result.document_id}</span>
                  </p>
                </div>
              </div>
              <Button variant="secondary" onClick={handleReset}>
                Process Another
              </Button>
            </div>

            <div className="px-6 py-4 bg-warm-bg/50 flex items-center gap-6 text-sm">
              <div className="flex items-center gap-2 text-warm-text-secondary">
                <Clock size={16} />
                <span className="font-medium">
                  {result.processing_time_ms.toFixed(0)}ms
                </span>
              </div>
              {result.fallback_used && (
                <span className="px-3 py-1.5 bg-warm-selected rounded-md text-warm-text-secondary font-medium text-xs">
                  Fallback OCR Used
                </span>
              )}
            </div>
          </div>

          {/* Main Content Grid */}
          <div className="grid grid-cols-1 lg:grid-cols-3 gap-6">
            {/* Left Column: Extracted Fields */}
            <div className="lg:col-span-2 space-y-6">
              <div className="bg-warm-card rounded-xl border border-warm-border p-6 shadow-sm">
                <h3 className="text-lg font-bold text-warm-text-primary mb-5 flex items-center gap-2">
                  <span className="w-1 h-5 bg-warm-text-secondary rounded-full"></span>
                  Extracted Fields
                </h3>
                <div className="flex flex-wrap gap-4">
                  {Object.entries(result.fields).map(([field, value]) => {
                    const confidence = result.confidence[field]
                    return (
                      <div
                        key={field}
                        className="p-4 bg-warm-bg/70 rounded-lg border border-warm-divider hover:border-warm-text-secondary/30 transition-colors w-[calc(50%-0.5rem)]"
                      >
                        <div className="text-xs font-semibold text-warm-text-muted uppercase tracking-wide mb-2">
                          {formatFieldName(field)}
                        </div>
                        <div className="text-sm font-bold text-warm-text-primary mb-2 min-h-[1.5rem]">
                          {value || <span className="text-warm-text-muted italic">N/A</span>}
                        </div>
                        {confidence && (
                          <div className="flex items-center gap-1.5 text-xs font-medium text-warm-text-secondary">
                            <CheckCircle2 size={13} />
                            <span>{(confidence * 100).toFixed(1)}%</span>
                          </div>
                        )}
                      </div>
                    )
                  })}
                </div>
              </div>

              {/* Visualization */}
              {result.visualization_url && (
                <div className="bg-warm-card rounded-xl border border-warm-border p-6 shadow-sm">
                  <h3 className="text-lg font-bold text-warm-text-primary mb-5 flex items-center gap-2">
                    <span className="w-1 h-5 bg-warm-text-secondary rounded-full"></span>
                    Detection Visualization
                  </h3>
                  <div className="bg-warm-bg rounded-lg overflow-hidden border border-warm-divider">
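                    {/* visualization_url is server-relative, so the API base URL is prefixed. */}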
                    <img
                      src={`${import.meta.env.VITE_API_URL || 'http://localhost:8000'}${result.visualization_url}`}
                      alt="Detection visualization"
                      className="w-full h-auto"
                    />
                  </div>
                </div>
              )}
            </div>

            {/* Right Column: Cross-Validation & Errors */}
            <div className="space-y-6">
              {/* Cross-Validation */}
              {result.cross_validation && (
                <div className="bg-warm-card rounded-xl border border-warm-border p-6 shadow-sm">
                  <h3 className="text-lg font-bold text-warm-text-primary mb-4 flex items-center gap-2">
                    <span className="w-1 h-5 bg-warm-text-secondary rounded-full"></span>
                    Payment Line Validation
                  </h3>

                  <div
                    className={`
                      p-4 rounded-lg mb-4 flex items-center gap-3
                      ${result.cross_validation.is_valid
                        ? 'bg-green-50 border border-green-200'
                        : 'bg-yellow-50 border border-yellow-200'
                      }
                    `}
                  >
                    {result.cross_validation.is_valid ? (
                      <>
                        <CheckCircle2 size={22} className="text-green-600 flex-shrink-0" />
                        <span className="font-bold text-green-800">All Fields Match</span>
                      </>
                    ) : (
                      <>
                        <AlertCircle size={22} className="text-yellow-600 flex-shrink-0" />
                        <span className="font-bold text-yellow-800">Mismatch Detected</span>
                      </>
                    )}
                  </div>

                  <div className="space-y-2.5">
                    {result.cross_validation.payment_line_ocr && (
                      <div
                        className={`
                          p-3 rounded-lg border transition-colors
                          ${result.cross_validation.ocr_match === true
                            ? 'bg-green-50 border-green-200'
                            : result.cross_validation.ocr_match === false
                            ? 'bg-red-50 border-red-200'
                            : 'bg-warm-bg border-warm-divider'
                          }
                        `}
                      >
                        <div className="flex items-center justify-between">
                          <div className="flex-1">
                            <div className="text-xs font-semibold text-warm-text-muted mb-1">
                              OCR NUMBER
                            </div>
                            <div className="text-sm font-bold text-warm-text-primary font-mono">
                              {result.cross_validation.payment_line_ocr}
                            </div>
                          </div>
                          {result.cross_validation.ocr_match === true && (
                            <CheckCircle2 size={16} className="text-green-600" />
                          )}
                          {result.cross_validation.ocr_match === false && (
                            <AlertCircle size={16} className="text-red-600" />
                          )}
                        </div>
                      </div>
                    )}

                    {result.cross_validation.payment_line_amount && (
                      <div
                        className={`
                          p-3 rounded-lg border transition-colors
                          ${result.cross_validation.amount_match === true
                            ? 'bg-green-50 border-green-200'
                            : result.cross_validation.amount_match === false
                            ? 'bg-red-50 border-red-200'
                            : 'bg-warm-bg border-warm-divider'
                          }
                        `}
                      >
                        <div className="flex items-center justify-between">
                          <div className="flex-1">
                            <div className="text-xs font-semibold text-warm-text-muted mb-1">
                              AMOUNT
                            </div>
                            <div className="text-sm font-bold text-warm-text-primary font-mono">
                              {result.cross_validation.payment_line_amount}
                            </div>
                          </div>
                          {result.cross_validation.amount_match === true && (
                            <CheckCircle2 size={16} className="text-green-600" />
                          )}
                          {result.cross_validation.amount_match === false && (
                            <AlertCircle size={16} className="text-red-600" />
                          )}
                        </div>
                      </div>
                    )}

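                    {/* The payment line carries either a bankgiro or a plusgiro
                        account, so the matching *_match flag is selected before
                        styling and rendering the status icon. */}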
                    {result.cross_validation.payment_line_account && (
                      <div
                        className={`
                          p-3 rounded-lg border transition-colors
                          ${(result.cross_validation.payment_line_account_type === 'bankgiro'
                            ? result.cross_validation.bankgiro_match
                            : result.cross_validation.plusgiro_match) === true
                            ? 'bg-green-50 border-green-200'
                            : (result.cross_validation.payment_line_account_type === 'bankgiro'
                              ? result.cross_validation.bankgiro_match
                              : result.cross_validation.plusgiro_match) === false
                            ? 'bg-red-50 border-red-200'
                            : 'bg-warm-bg border-warm-divider'
                          }
                        `}
                      >
                        <div className="flex items-center justify-between">
                          <div className="flex-1">
                            <div className="text-xs font-semibold text-warm-text-muted mb-1">
                              {result.cross_validation.payment_line_account_type === 'bankgiro'
                                ? 'BANKGIRO'
                                : 'PLUSGIRO'}
                            </div>
                            <div className="text-sm font-bold text-warm-text-primary font-mono">
                              {result.cross_validation.payment_line_account}
                            </div>
                          </div>
                          {(result.cross_validation.payment_line_account_type === 'bankgiro'
                            ? result.cross_validation.bankgiro_match
                            : result.cross_validation.plusgiro_match) === true && (
                            <CheckCircle2 size={16} className="text-green-600" />
                          )}
                          {(result.cross_validation.payment_line_account_type === 'bankgiro'
                            ? result.cross_validation.bankgiro_match
                            : result.cross_validation.plusgiro_match) === false && (
                            <AlertCircle size={16} className="text-red-600" />
                          )}
                        </div>
                      </div>
                    )}
                  </div>

                  {result.cross_validation.details.length > 0 && (
                    <div className="mt-4 p-3 bg-warm-bg/70 rounded-lg text-xs text-warm-text-secondary leading-relaxed border border-warm-divider">
                      {result.cross_validation.details[result.cross_validation.details.length - 1]}
                    </div>
                  )}
                </div>
              )}

              {/* Errors */}
              {result.errors.length > 0 && (
                <div className="bg-warm-card rounded-xl border border-warm-border p-6 shadow-sm">
                  <h3 className="text-lg font-bold text-warm-text-primary mb-4 flex items-center gap-2">
                    <span className="w-1 h-5 bg-red-500 rounded-full"></span>
                    Issues
                  </h3>
                  <div className="space-y-2.5">
                    {result.errors.map((err, idx) => (
                      <div
                        key={idx}
                        className="p-3 bg-yellow-50 border border-yellow-200 rounded-lg flex items-start gap-3"
                      >
                        <AlertCircle size={16} className="text-yellow-600 flex-shrink-0 mt-0.5" />
                        <span className="text-xs text-yellow-800 leading-relaxed">{err}</span>
                      </div>
                    ))}
                  </div>
                </div>
              )}
            </div>
          </div>
        </div>
      )}
    </div>
  )
}
frontend/src/components/Layout.tsx (102 lines, Normal file)
@@ -0,0 +1,102 @@
import React, { useState } from 'react';
import { Box, LayoutTemplate, Users, BookOpen, LogOut, Sparkles } from 'lucide-react';

interface LayoutProps {
  children: React.ReactNode;
  activeView: string;
  onNavigate: (view: string) => void;
  onLogout?: () => void;
}

export const Layout: React.FC<LayoutProps> = ({ children, activeView, onNavigate, onLogout }) => {
  const [showDropdown, setShowDropdown] = useState(false);
  const navItems = [
    { id: 'dashboard', label: 'Dashboard', icon: LayoutTemplate },
    { id: 'demo', label: 'Demo', icon: Sparkles },
    { id: 'training', label: 'Training', icon: Box }, // Mapped to "Complaints" visually in the design prompt; using the logical name here
    { id: 'documents', label: 'Documents', icon: BookOpen },
    { id: 'models', label: 'Models', icon: Users }, // "Contacts" in the design prompt, mapped to models for this use case
  ];

  return (
    <div className="min-h-screen bg-warm-bg font-sans text-warm-text-primary flex flex-col">
      {/* Top Navigation */}
      <nav className="h-14 bg-warm-bg border-b border-warm-border px-6 flex items-center justify-between shrink-0 sticky top-0 z-40">
        <div className="flex items-center gap-8">
          {/* Logo */}
          <div className="flex items-center gap-2">
            <div className="w-8 h-8 bg-warm-text-primary rounded-full flex items-center justify-center text-white">
              <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="3" strokeLinecap="round" strokeLinejoin="round">
                <path d="M12 2L2 7l10 5 10-5-10-5zM2 17l10 5 10-5M2 12l10 5 10-5"/>
              </svg>
            </div>
          </div>

          {/* Nav Links */}
          <div className="flex h-14">
            {navItems.map(item => {
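              // Keep the Documents tab highlighted while a document detail view is open.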
              const isActive = activeView === item.id || (activeView === 'detail' && item.id === 'documents');
              return (
                <button
                  key={item.id}
                  onClick={() => onNavigate(item.id)}
                  className={`
                    relative px-4 h-full flex items-center text-sm font-medium transition-colors
                    ${isActive ? 'text-warm-text-primary' : 'text-warm-text-muted hover:text-warm-text-secondary'}
                  `}
                >
                  {item.label}
                  {isActive && (
                    <div className="absolute bottom-0 left-0 right-0 h-0.5 bg-warm-text-secondary rounded-t-full mx-2" />
                  )}
                </button>
              );
            })}
          </div>
        </div>

        {/* User Profile */}
        <div className="flex items-center gap-3 pl-6 border-l border-warm-border h-6 relative">
          <button
            onClick={() => setShowDropdown(!showDropdown)}
            className="w-8 h-8 rounded-full bg-warm-selected flex items-center justify-center text-xs font-semibold text-warm-text-secondary border border-warm-divider hover:bg-warm-hover transition-colors"
          >
            AD
          </button>

          {showDropdown && (
            <>
              <div
                className="fixed inset-0 z-10"
                onClick={() => setShowDropdown(false)}
              />
              <div className="absolute right-0 top-10 w-48 bg-warm-card border border-warm-border rounded-lg shadow-modal z-20">
                <div className="p-3 border-b border-warm-border">
                  <p className="text-sm font-medium text-warm-text-primary">Admin User</p>
                  <p className="text-xs text-warm-text-muted mt-0.5">Authenticated</p>
                </div>
                {onLogout && (
                  <button
                    onClick={() => {
                      setShowDropdown(false)
                      onLogout()
                    }}
                    className="w-full px-3 py-2 text-left text-sm text-warm-text-secondary hover:bg-warm-hover transition-colors flex items-center gap-2"
                  >
                    <LogOut size={14} />
                    Sign Out
                  </button>
                )}
              </div>
            </>
          )}
        </div>
      </nav>

      {/* Main Content */}
      <main className="flex-1 overflow-auto">
        {children}
      </main>
    </div>
  );
};
frontend/src/components/Login.tsx (188 lines, Normal file)
@@ -0,0 +1,188 @@
import React, { useState } from 'react'
import { Button } from './Button'

interface LoginProps {
  onLogin: (token: string) => void
}

export const Login: React.FC<LoginProps> = ({ onLogin }) => {
  const [token, setToken] = useState('')
  const [name, setName] = useState('')
  const [description, setDescription] = useState('')
  const [isCreating, setIsCreating] = useState(false)
  const [error, setError] = useState('')
  const [createdToken, setCreatedToken] = useState('')

  const handleLoginWithToken = () => {
    if (!token.trim()) {
      setError('Please enter a token')
      return
    }
    localStorage.setItem('admin_token', token.trim())
    onLogin(token.trim())
  }

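  // NOTE: the token-creation endpoint below is hardcoded to localhost, unlike
  // other components that fall back through import.meta.env.VITE_API_URL.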
  const handleCreateToken = async () => {
    if (!name.trim()) {
      setError('Please enter a token name')
      return
    }

    setIsCreating(true)
    setError('')

    try {
      const response = await fetch('http://localhost:8000/api/v1/admin/auth/token', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          name: name.trim(),
          description: description.trim() || undefined,
        }),
      })

      if (!response.ok) {
        throw new Error('Failed to create token')
      }

      const data = await response.json()
      setCreatedToken(data.token)
      setToken(data.token)
      setError('')
    } catch (err) {
      setError('Failed to create token. Please check your connection.')
      console.error(err)
    } finally {
      setIsCreating(false)
    }
  }

  const handleUseCreatedToken = () => {
    if (createdToken) {
      localStorage.setItem('admin_token', createdToken)
      onLogin(createdToken)
    }
  }

  return (
    <div className="min-h-screen bg-warm-bg flex items-center justify-center p-4">
      <div className="bg-warm-card border border-warm-border rounded-lg shadow-modal p-8 max-w-md w-full">
        <h1 className="text-2xl font-bold text-warm-text-primary mb-2">
          Admin Authentication
        </h1>
        <p className="text-sm text-warm-text-muted mb-6">
          Sign in with an admin token to access the document management system
        </p>

        {error && (
          <div className="mb-4 p-3 bg-red-50 border border-red-200 text-red-800 rounded text-sm">
            {error}
          </div>
        )}

        {createdToken && (
          <div className="mb-4 p-3 bg-green-50 border border-green-200 rounded">
            <p className="text-sm font-medium text-green-800 mb-2">Token created successfully!</p>
            <div className="bg-white border border-green-300 rounded p-2 mb-3">
              <code className="text-xs font-mono text-warm-text-primary break-all">
                {createdToken}
              </code>
            </div>
            <p className="text-xs text-green-700 mb-3">
              Save this token securely. You won't be able to see it again.
            </p>
            <Button onClick={handleUseCreatedToken} className="w-full">
              Use This Token
            </Button>
          </div>
        )}

        <div className="space-y-6">
          {/* Login with existing token */}
          <div>
            <h2 className="text-sm font-semibold text-warm-text-secondary mb-3">
              Sign in with existing token
            </h2>
            <div className="space-y-3">
              <div>
                <label className="block text-sm text-warm-text-secondary mb-1">
                  Admin Token
                </label>
                <input
                  type="text"
                  value={token}
                  onChange={(e) => setToken(e.target.value)}
                  placeholder="Enter your admin token"
                  className="w-full px-3 py-2 border border-warm-border rounded-md text-sm focus:outline-none focus:ring-1 focus:ring-warm-state-info font-mono"
                  onKeyDown={(e) => e.key === 'Enter' && handleLoginWithToken()}
                />
              </div>
              <Button onClick={handleLoginWithToken} className="w-full">
                Sign In
              </Button>
            </div>
          </div>

          <div className="relative">
            <div className="absolute inset-0 flex items-center">
              <div className="w-full border-t border-warm-border"></div>
            </div>
            <div className="relative flex justify-center text-xs">
              <span className="px-2 bg-warm-card text-warm-text-muted">OR</span>
            </div>
          </div>

          {/* Create new token */}
          <div>
            <h2 className="text-sm font-semibold text-warm-text-secondary mb-3">
              Create new admin token
            </h2>
            <div className="space-y-3">
              <div>
                <label className="block text-sm text-warm-text-secondary mb-1">
                  Token Name <span className="text-red-500">*</span>
                </label>
                <input
                  type="text"
                  value={name}
                  onChange={(e) => setName(e.target.value)}
                  placeholder="e.g., my-laptop"
                  className="w-full px-3 py-2 border border-warm-border rounded-md text-sm focus:outline-none focus:ring-1 focus:ring-warm-state-info"
                />
              </div>
              <div>
                <label className="block text-sm text-warm-text-secondary mb-1">
                  Description (optional)
                </label>
                <input
                  type="text"
                  value={description}
                  onChange={(e) => setDescription(e.target.value)}
                  placeholder="e.g., Personal laptop access"
                  className="w-full px-3 py-2 border border-warm-border rounded-md text-sm focus:outline-none focus:ring-1 focus:ring-warm-state-info"
                />
              </div>
              <Button
                onClick={handleCreateToken}
                variant="secondary"
                disabled={isCreating}
                className="w-full"
              >
                {isCreating ? 'Creating...' : 'Create Token'}
              </Button>
            </div>
          </div>
        </div>

        <div className="mt-6 pt-4 border-t border-warm-border">
          <p className="text-xs text-warm-text-muted">
            Admin tokens are used to authenticate with the document management API.
            Keep your tokens secure and never share them.
          </p>
        </div>
      </div>
    </div>
  )
}

frontend/src/components/Models.tsx (new file, 134 lines)
@@ -0,0 +1,134 @@
import React from 'react';
import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip, ResponsiveContainer } from 'recharts';
import { Button } from './Button';

const CHART_DATA = [
  { name: 'Model A', value: 75 },
  { name: 'Model B', value: 82 },
  { name: 'Model C', value: 95 },
  { name: 'Model D', value: 68 },
];

const METRICS_DATA = [
  { name: 'Precision', value: 88 },
  { name: 'Recall', value: 76 },
  { name: 'F1 Score', value: 91 },
  { name: 'Accuracy', value: 82 },
];

const JOBS = [
  { id: 1, name: 'Training Job 1', date: '12/29/2024 10:33 PM', status: 'Running', progress: 65 },
  { id: 2, name: 'Training Job 2', date: '12/29/2024 10:33 PM', status: 'Completed', success: 37, metrics: 89 },
  { id: 3, name: 'Model Training Component 1', date: '12/29/2024 10:19 PM', status: 'Completed', success: 87, metrics: 92 },
];

export const Models: React.FC = () => {
  return (
    <div className="p-8 max-w-7xl mx-auto flex gap-8">
      {/* Left: Job History */}
      <div className="flex-1">
        <h2 className="text-2xl font-bold text-warm-text-primary mb-6">Models & History</h2>
        <h3 className="text-lg font-semibold text-warm-text-primary mb-4">Recent Training Jobs</h3>

        <div className="space-y-4">
          {JOBS.map(job => (
            <div key={job.id} className="bg-warm-card border border-warm-border rounded-lg p-5 shadow-sm hover:border-warm-divider transition-colors">
              <div className="flex justify-between items-start mb-2">
                <div>
                  <h4 className="font-semibold text-warm-text-primary text-lg mb-1">{job.name}</h4>
                  <p className="text-sm text-warm-text-muted">Started {job.date}</p>
                </div>
                <span className={`px-3 py-1 rounded-full text-xs font-medium ${job.status === 'Running' ? 'bg-warm-selected text-warm-text-secondary' : 'bg-warm-selected text-warm-state-success'}`}>
                  {job.status}
                </span>
              </div>

              {job.status === 'Running' ? (
                <div className="mt-4">
                  <div className="h-2 w-full bg-warm-selected rounded-full overflow-hidden">
                    <div className="h-full bg-warm-text-secondary w-[65%] rounded-full"></div>
                  </div>
                </div>
              ) : (
                <div className="mt-4 flex gap-8">
                  <div>
                    <span className="block text-xs text-warm-text-muted uppercase tracking-wide">Success</span>
                    <span className="text-lg font-mono text-warm-text-secondary">{job.success}</span>
                  </div>
                  <div>
                    <span className="block text-xs text-warm-text-muted uppercase tracking-wide">Performance</span>
                    <span className="text-lg font-mono text-warm-text-secondary">{job.metrics}%</span>
                  </div>
                  <div>
                    <span className="block text-xs text-warm-text-muted uppercase tracking-wide">Completed</span>
                    <span className="text-lg font-mono text-warm-text-secondary">100%</span>
                  </div>
                </div>
              )}
            </div>
          ))}
        </div>
      </div>

      {/* Right: Model Detail */}
      <div className="w-[400px]">
        <div className="bg-warm-card border border-warm-border rounded-lg p-6 shadow-card sticky top-8">
          <div className="flex justify-between items-center mb-6">
            <h3 className="text-xl font-bold text-warm-text-primary">Model Detail</h3>
            <span className="text-sm font-medium text-warm-state-success">Completed</span>
          </div>

          <div className="mb-8">
            <p className="text-sm text-warm-text-muted mb-1">Model name</p>
            <p className="font-medium text-warm-text-primary">Invoices Q4 v2.1</p>
          </div>

          <div className="space-y-8">
            {/* Chart 1 */}
            <div>
              <h4 className="text-sm font-semibold text-warm-text-secondary mb-4">Bar Rate Metrics</h4>
              <div className="h-40">
                <ResponsiveContainer width="100%" height="100%">
                  <BarChart data={CHART_DATA}>
                    <CartesianGrid strokeDasharray="3 3" vertical={false} stroke="#E6E4E1" />
                    <XAxis dataKey="name" hide />
                    <YAxis hide domain={[0, 100]} />
                    <Tooltip
                      cursor={{fill: '#F1F0ED'}}
                      contentStyle={{borderRadius: '8px', border: '1px solid #E6E4E1', boxShadow: '0 2px 5px rgba(0,0,0,0.05)'}}
                    />
                    <Bar dataKey="value" fill="#3A3A3A" radius={[4, 4, 0, 0]} barSize={32} />
                  </BarChart>
                </ResponsiveContainer>
              </div>
            </div>

            {/* Chart 2 */}
            <div>
              <h4 className="text-sm font-semibold text-warm-text-secondary mb-4">Entity Extraction Accuracy</h4>
              <div className="h-40">
                <ResponsiveContainer width="100%" height="100%">
                  <BarChart data={METRICS_DATA}>
                    <CartesianGrid strokeDasharray="3 3" vertical={false} stroke="#E6E4E1" />
                    <XAxis dataKey="name" tick={{fontSize: 10, fill: '#6B6B6B'}} axisLine={false} tickLine={false} />
                    <YAxis hide domain={[0, 100]} />
                    <Tooltip cursor={{fill: '#F1F0ED'}} />
                    <Bar dataKey="value" fill="#3A3A3A" radius={[4, 4, 0, 0]} barSize={32} />
                  </BarChart>
                </ResponsiveContainer>
              </div>
            </div>
          </div>

          <div className="mt-8 space-y-3">
            <Button className="w-full">Download Model</Button>
            <div className="flex gap-3">
              <Button variant="secondary" className="flex-1">View Logs</Button>
              <Button variant="secondary" className="flex-1">Use as Base</Button>
            </div>
          </div>
        </div>
      </div>
    </div>
  );
};

frontend/src/components/Training.tsx (new file, 113 lines)
@@ -0,0 +1,113 @@
import React, { useState } from 'react';
import { Check, AlertCircle } from 'lucide-react';
import { Button } from './Button';
import { DocumentStatus } from '../types';

export const Training: React.FC = () => {
  const [split, setSplit] = useState(80);

  const docs = [
    { id: '1', name: 'Document 1', date: '12/28/2024', status: DocumentStatus.VERIFIED },
    { id: '2', name: 'Document 2', date: '12/29/2024', status: DocumentStatus.VERIFIED },
    { id: '3', name: 'Document 3', date: '12/29/2024', status: DocumentStatus.VERIFIED },
    { id: '4', name: 'Document 4', date: '12/29/2024', status: DocumentStatus.PARTIAL },
    { id: '5', name: 'Document 5', date: '12/29/2024', status: DocumentStatus.PARTIAL },
    { id: '6', name: 'Document 6', date: '12/29/2024', status: DocumentStatus.PARTIAL },
    { id: '8', name: 'Document 8', date: '12/29/2024', status: DocumentStatus.VERIFIED },
  ];

  return (
    <div className="p-8 max-w-7xl mx-auto h-[calc(100vh-56px)] flex gap-8">
      {/* Document Selection List */}
      <div className="flex-1 flex flex-col">
        <h2 className="text-2xl font-bold text-warm-text-primary mb-6">Document Selection</h2>

        <div className="flex-1 bg-warm-card border border-warm-border rounded-lg overflow-hidden flex flex-col shadow-sm">
          <div className="overflow-auto flex-1">
            <table className="w-full text-left">
              <thead className="sticky top-0 bg-white border-b border-warm-border z-10">
                <tr>
                  <th className="py-3 pl-6 pr-4 w-12"><input type="checkbox" className="rounded border-warm-divider"/></th>
                  <th className="py-3 px-4 text-xs font-semibold text-warm-text-muted uppercase">Document name</th>
                  <th className="py-3 px-4 text-xs font-semibold text-warm-text-muted uppercase">Date</th>
                  <th className="py-3 px-4 text-xs font-semibold text-warm-text-muted uppercase">Status</th>
                </tr>
              </thead>
              <tbody>
                {docs.map(doc => (
                  <tr key={doc.id} className="border-b border-warm-border hover:bg-warm-hover transition-colors">
                    <td className="py-3 pl-6 pr-4"><input type="checkbox" defaultChecked className="rounded border-warm-divider accent-warm-state-info"/></td>
                    <td className="py-3 px-4 text-sm font-medium text-warm-text-secondary">{doc.name}</td>
                    <td className="py-3 px-4 text-sm text-warm-text-muted font-mono">{doc.date}</td>
                    <td className="py-3 px-4">
                      {doc.status === DocumentStatus.VERIFIED ? (
                        <div className="flex items-center text-warm-state-success text-sm font-medium">
                          <div className="w-5 h-5 rounded-full bg-warm-state-success flex items-center justify-center text-white mr-2">
                            <Check size={12} strokeWidth={3}/>
                          </div>
                          Verified
                        </div>
                      ) : (
                        <div className="flex items-center text-warm-text-muted text-sm">
                          <div className="w-5 h-5 rounded-full bg-[#BDBBB5] flex items-center justify-center text-white mr-2">
                            <span className="font-bold text-[10px]">!</span>
                          </div>
                          Partial
                        </div>
                      )}
                    </td>
                  </tr>
                ))}
              </tbody>
            </table>
          </div>
        </div>
      </div>

      {/* Configuration Panel */}
      <div className="w-96">
        <div className="bg-warm-card rounded-lg border border-warm-border shadow-card p-6 sticky top-8">
          <h3 className="text-lg font-semibold text-warm-text-primary mb-6">Training Configuration</h3>

          <div className="space-y-6">
            <div>
              <label className="block text-sm font-medium text-warm-text-secondary mb-2">Model Name</label>
              <input
                type="text"
                placeholder="e.g. Invoices Q4"
                className="w-full h-10 px-3 rounded-md border border-warm-divider bg-white text-warm-text-primary focus:outline-none focus:ring-1 focus:ring-warm-state-info"
              />
            </div>

            <div>
              <label className="block text-sm font-medium text-warm-text-secondary mb-2">Base Model</label>
              <select className="w-full h-10 px-3 rounded-md border border-warm-divider bg-white text-warm-text-primary focus:outline-none focus:ring-1 focus:ring-warm-state-info appearance-none">
                <option>LayoutLMv3 (Standard)</option>
                <option>Donut (Beta)</option>
              </select>
            </div>

            <div>
              <div className="flex justify-between mb-2">
                <label className="block text-sm font-medium text-warm-text-secondary">Train/Test Split</label>
                <span className="text-xs font-mono text-warm-text-muted">{split}% / {100 - split}%</span>
              </div>
              <input
                type="range"
                min="50"
                max="95"
                value={split}
                onChange={(e) => setSplit(parseInt(e.target.value))}
                className="w-full h-1.5 bg-warm-border rounded-lg appearance-none cursor-pointer accent-warm-state-info"
              />
            </div>

            <div className="pt-4 border-t border-warm-border">
              <Button className="w-full h-12">Start Training</Button>
            </div>
          </div>
        </div>
      </div>
    </div>
  );
};

frontend/src/components/UploadModal.tsx (new file, 210 lines)
@@ -0,0 +1,210 @@
import React, { useState, useRef } from 'react'
import { X, UploadCloud, File, CheckCircle, AlertCircle } from 'lucide-react'
import { Button } from './Button'
import { useDocuments } from '../hooks/useDocuments'

interface UploadModalProps {
  isOpen: boolean
  onClose: () => void
}

export const UploadModal: React.FC<UploadModalProps> = ({ isOpen, onClose }) => {
  const [isDragging, setIsDragging] = useState(false)
  const [selectedFiles, setSelectedFiles] = useState<File[]>([])
  const [uploadStatus, setUploadStatus] = useState<'idle' | 'uploading' | 'success' | 'error'>('idle')
  const [errorMessage, setErrorMessage] = useState('')
  const fileInputRef = useRef<HTMLInputElement>(null)

  const { uploadDocument, isUploading } = useDocuments({})

  if (!isOpen) return null

  const handleFileSelect = (files: FileList | null) => {
    if (!files) return

    // Accept PDFs and images up to 25 MB
    const validFiles = Array.from(files).filter(file => {
      const isPdf = file.type === 'application/pdf'
      const isImage = file.type.startsWith('image/')
      const isUnder25MB = file.size <= 25 * 1024 * 1024
      return (isPdf || isImage) && isUnder25MB
    })

    setSelectedFiles(prev => [...prev, ...validFiles])
    setUploadStatus('idle')
    setErrorMessage('')
  }

  const handleDrop = (e: React.DragEvent) => {
    e.preventDefault()
    setIsDragging(false)
    handleFileSelect(e.dataTransfer.files)
  }

  const handleBrowseClick = () => {
    fileInputRef.current?.click()
  }

  const removeFile = (index: number) => {
    setSelectedFiles(prev => prev.filter((_, i) => i !== index))
  }

  const handleUpload = async () => {
    if (selectedFiles.length === 0) {
      setErrorMessage('Please select at least one file')
      return
    }

    setUploadStatus('uploading')
    setErrorMessage('')

    try {
      // Upload files one by one
      for (const file of selectedFiles) {
        await new Promise<void>((resolve, reject) => {
          uploadDocument(file, {
            onSuccess: () => resolve(),
            onError: (error: Error) => reject(error),
          })
        })
      }

      setUploadStatus('success')
      setTimeout(() => {
        onClose()
        setSelectedFiles([])
        setUploadStatus('idle')
      }, 1500)
    } catch (error) {
      setUploadStatus('error')
      setErrorMessage(error instanceof Error ? error.message : 'Upload failed')
    }
  }

  const handleClose = () => {
    if (uploadStatus === 'uploading') {
      return // Prevent closing during upload
    }
    setSelectedFiles([])
    setUploadStatus('idle')
    setErrorMessage('')
    onClose()
  }

  return (
    <div className="fixed inset-0 z-50 flex items-center justify-center bg-black/20 backdrop-blur-sm transition-opacity duration-200">
      <div
        className="w-full max-w-lg bg-warm-card rounded-lg shadow-modal border border-warm-border transform transition-all duration-200 scale-100 p-6"
        onClick={(e) => e.stopPropagation()}
      >
        <div className="flex items-center justify-between mb-6">
          <h3 className="text-xl font-semibold text-warm-text-primary">Upload Documents</h3>
          <button
            onClick={handleClose}
            className="text-warm-text-muted hover:text-warm-text-primary transition-colors disabled:opacity-50"
            disabled={uploadStatus === 'uploading'}
          >
            <X size={20} />
          </button>
        </div>

        {/* Drop Zone */}
        <div
          className={`
            w-full h-48 rounded-lg border-2 border-dashed flex flex-col items-center justify-center gap-3 transition-colors duration-150 mb-6 cursor-pointer
            ${isDragging ? 'border-warm-text-secondary bg-warm-selected' : 'border-warm-divider bg-warm-bg hover:bg-warm-hover'}
            ${uploadStatus === 'uploading' ? 'opacity-50 pointer-events-none' : ''}
          `}
          onDragOver={(e) => { e.preventDefault(); setIsDragging(true); }}
          onDragLeave={() => setIsDragging(false)}
          onDrop={handleDrop}
          onClick={handleBrowseClick}
        >
          <div className="p-3 bg-white rounded-full shadow-sm">
            <UploadCloud size={24} className="text-warm-text-secondary" />
          </div>
          <div className="text-center">
            <p className="text-sm font-medium text-warm-text-primary">
              Drag & drop files here or <span className="underline decoration-1 underline-offset-2 hover:text-warm-state-info">Browse</span>
            </p>
            <p className="text-xs text-warm-text-muted mt-1">PDF, JPG, PNG up to 25MB</p>
          </div>
        </div>

        <input
          ref={fileInputRef}
          type="file"
          multiple
          accept=".pdf,image/*"
          className="hidden"
          onChange={(e) => handleFileSelect(e.target.files)}
        />

        {/* Selected Files */}
        {selectedFiles.length > 0 && (
          <div className="mb-6 max-h-40 overflow-y-auto">
            <p className="text-sm font-medium text-warm-text-secondary mb-2">
              Selected Files ({selectedFiles.length})
            </p>
            <div className="space-y-2">
              {selectedFiles.map((file, index) => (
                <div
                  key={index}
                  className="flex items-center justify-between p-2 bg-warm-bg rounded border border-warm-border"
                >
                  <div className="flex items-center gap-2 flex-1 min-w-0">
                    <File size={16} className="text-warm-text-muted flex-shrink-0" />
                    <span className="text-sm text-warm-text-secondary truncate">
                      {file.name}
                    </span>
                    <span className="text-xs text-warm-text-muted flex-shrink-0">
                      ({(file.size / 1024 / 1024).toFixed(2)} MB)
                    </span>
                  </div>
                  <button
                    onClick={() => removeFile(index)}
                    className="text-warm-text-muted hover:text-warm-state-error ml-2 flex-shrink-0"
                    disabled={uploadStatus === 'uploading'}
                  >
                    <X size={16} />
                  </button>
                </div>
              ))}
            </div>
          </div>
        )}

        {/* Status Messages */}
        {uploadStatus === 'success' && (
          <div className="mb-4 p-3 bg-green-50 border border-green-200 rounded flex items-center gap-2">
            <CheckCircle size={16} className="text-green-600" />
            <span className="text-sm text-green-800">Upload successful!</span>
          </div>
        )}

        {uploadStatus === 'error' && errorMessage && (
          <div className="mb-4 p-3 bg-red-50 border border-red-200 rounded flex items-center gap-2">
            <AlertCircle size={16} className="text-red-600" />
            <span className="text-sm text-red-800">{errorMessage}</span>
          </div>
        )}

        {/* Actions */}
        <div className="mt-8 flex justify-end gap-3">
          <Button
            variant="secondary"
            onClick={handleClose}
            disabled={uploadStatus === 'uploading'}
          >
            Cancel
          </Button>
          <Button
            onClick={handleUpload}
            disabled={selectedFiles.length === 0 || uploadStatus === 'uploading'}
          >
            {uploadStatus === 'uploading' ? 'Uploading...' : `Upload ${selectedFiles.length > 0 ? `(${selectedFiles.length})` : ''}`}
          </Button>
        </div>
      </div>
    </div>
  )
}
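
Note on the sequential loop in handleUpload above: useDocuments (defined below) also exposes uploadDocumentAsync, the mutateAsync form of the same mutation, so the manual Promise wrapper is avoidable. A minimal sketch under that assumption:

// Sketch only: sequential upload via the async mutation variant.
// Assumes uploadDocumentAsync from useDocuments (mutateAsync under the hood).
async function uploadAllSequentially(
  files: File[],
  uploadDocumentAsync: (file: File) => Promise<unknown>,
): Promise<void> {
  for (const file of files) {
    // Each upload settles before the next begins, matching the loop above.
    await uploadDocumentAsync(file)
  }
}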

frontend/src/hooks/index.ts (new file, 4 lines)
@@ -0,0 +1,4 @@
export { useDocuments } from './useDocuments'
export { useDocumentDetail } from './useDocumentDetail'
export { useAnnotations } from './useAnnotations'
export { useTraining, useTrainingDocuments } from './useTraining'

frontend/src/hooks/useAnnotations.ts (new file, 70 lines)
@@ -0,0 +1,70 @@
import { useMutation, useQueryClient } from '@tanstack/react-query'
import { annotationsApi } from '../api/endpoints'
import type { CreateAnnotationRequest, AnnotationOverrideRequest } from '../api/types'

export const useAnnotations = (documentId: string) => {
  const queryClient = useQueryClient()

  const createMutation = useMutation({
    mutationFn: (annotation: CreateAnnotationRequest) =>
      annotationsApi.create(documentId, annotation),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['document', documentId] })
    },
  })

  const updateMutation = useMutation({
    mutationFn: ({
      annotationId,
      updates,
    }: {
      annotationId: string
      updates: Partial<CreateAnnotationRequest>
    }) => annotationsApi.update(documentId, annotationId, updates),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['document', documentId] })
    },
  })

  const deleteMutation = useMutation({
    mutationFn: (annotationId: string) =>
      annotationsApi.delete(documentId, annotationId),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['document', documentId] })
    },
  })

  const verifyMutation = useMutation({
    mutationFn: (annotationId: string) =>
      annotationsApi.verify(documentId, annotationId),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['document', documentId] })
    },
  })

  const overrideMutation = useMutation({
    mutationFn: ({
      annotationId,
      overrideData,
    }: {
      annotationId: string
      overrideData: AnnotationOverrideRequest
    }) => annotationsApi.override(documentId, annotationId, overrideData),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['document', documentId] })
    },
  })

  return {
    createAnnotation: createMutation.mutate,
    isCreating: createMutation.isPending,
    updateAnnotation: updateMutation.mutate,
    isUpdating: updateMutation.isPending,
    deleteAnnotation: deleteMutation.mutate,
    isDeleting: deleteMutation.isPending,
    verifyAnnotation: verifyMutation.mutate,
    isVerifying: verifyMutation.isPending,
    overrideAnnotation: overrideMutation.mutate,
    isOverriding: overrideMutation.isPending,
  }
}

frontend/src/hooks/useDocumentDetail.ts (new file, 25 lines)
@@ -0,0 +1,25 @@
import { useQuery } from '@tanstack/react-query'
import { documentsApi } from '../api/endpoints'
import type { DocumentDetailResponse } from '../api/types'

export const useDocumentDetail = (documentId: string | null) => {
  const { data, isLoading, error, refetch } = useQuery<DocumentDetailResponse>({
    queryKey: ['document', documentId],
    queryFn: () => {
      if (!documentId) {
        throw new Error('Document ID is required')
      }
      return documentsApi.getDetail(documentId)
    },
    enabled: !!documentId,
    staleTime: 10000,
  })

  return {
    document: data || null,
    annotations: data?.annotations || [],
    isLoading,
    error,
    refetch,
  }
}

frontend/src/hooks/useDocuments.ts (new file, 78 lines)
@@ -0,0 +1,78 @@
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
import { documentsApi } from '../api/endpoints'
import type { DocumentListResponse, UploadDocumentResponse } from '../api/types'

interface UseDocumentsParams {
  status?: string
  limit?: number
  offset?: number
}

export const useDocuments = (params: UseDocumentsParams = {}) => {
  const queryClient = useQueryClient()

  const { data, isLoading, error, refetch } = useQuery<DocumentListResponse>({
    queryKey: ['documents', params],
    queryFn: () => documentsApi.list(params),
    staleTime: 30000,
  })

  const uploadMutation = useMutation({
    mutationFn: (file: File) => documentsApi.upload(file),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['documents'] })
    },
  })

  const batchUploadMutation = useMutation({
    mutationFn: ({ files, csvFile }: { files: File[]; csvFile?: File }) =>
      documentsApi.batchUpload(files, csvFile),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['documents'] })
    },
  })

  const deleteMutation = useMutation({
    mutationFn: (documentId: string) => documentsApi.delete(documentId),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['documents'] })
    },
  })

  const updateStatusMutation = useMutation({
    mutationFn: ({ documentId, status }: { documentId: string; status: string }) =>
      documentsApi.updateStatus(documentId, status),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['documents'] })
    },
  })

  const triggerAutoLabelMutation = useMutation({
    mutationFn: (documentId: string) => documentsApi.triggerAutoLabel(documentId),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['documents'] })
    },
  })

  return {
    documents: data?.documents || [],
    total: data?.total || 0,
    limit: data?.limit || params.limit || 20,
    offset: data?.offset || params.offset || 0,
    isLoading,
    error,
    refetch,
    uploadDocument: uploadMutation.mutate,
    uploadDocumentAsync: uploadMutation.mutateAsync,
    isUploading: uploadMutation.isPending,
    batchUpload: batchUploadMutation.mutate,
    batchUploadAsync: batchUploadMutation.mutateAsync,
    isBatchUploading: batchUploadMutation.isPending,
    deleteDocument: deleteMutation.mutate,
    isDeleting: deleteMutation.isPending,
    updateStatus: updateStatusMutation.mutate,
    isUpdatingStatus: updateStatusMutation.isPending,
    triggerAutoLabel: triggerAutoLabelMutation.mutate,
    isTriggeringAutoLabel: triggerAutoLabelMutation.isPending,
  }
}
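
For orientation, a minimal consumer of this hook could look like the following; the component name and markup are illustrative, not part of the codebase:

// Hypothetical consumer of useDocuments; names are illustrative only.
import React from 'react'
import { useDocuments } from './useDocuments'

export const PendingDocumentCount: React.FC = () => {
  const { documents, total, isLoading } = useDocuments({ status: 'pending', limit: 20 })

  if (isLoading) return <span>Loading…</span>
  // The hook defaults `documents` to [], so .length is always safe.
  return <span>{documents.length} of {total} pending documents</span>
}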

frontend/src/hooks/useTraining.ts (new file, 83 lines)
@@ -0,0 +1,83 @@
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
import { trainingApi } from '../api/endpoints'
import type { TrainingModelsResponse } from '../api/types'

export const useTraining = () => {
  const queryClient = useQueryClient()

  const { data: modelsData, isLoading: isLoadingModels } =
    useQuery<TrainingModelsResponse>({
      queryKey: ['training', 'models'],
      queryFn: () => trainingApi.getModels(),
      staleTime: 30000,
    })

  const startTrainingMutation = useMutation({
    mutationFn: (config: {
      name: string
      description?: string
      document_ids: string[]
      epochs?: number
      batch_size?: number
      model_base?: string
    }) => trainingApi.startTraining(config),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['training', 'models'] })
    },
  })

  const cancelTaskMutation = useMutation({
    mutationFn: (taskId: string) => trainingApi.cancelTask(taskId),
    onSuccess: () => {
      queryClient.invalidateQueries({ queryKey: ['training', 'models'] })
    },
  })

  const downloadModelMutation = useMutation({
    mutationFn: (taskId: string) => trainingApi.downloadModel(taskId),
    onSuccess: (blob, taskId) => {
      const url = window.URL.createObjectURL(blob)
      const a = document.createElement('a')
      a.href = url
      a.download = `model-${taskId}.pt`
      document.body.appendChild(a)
      a.click()
      window.URL.revokeObjectURL(url)
      document.body.removeChild(a)
    },
  })

  return {
    models: modelsData?.models || [],
    total: modelsData?.total || 0,
    isLoadingModels,
    startTraining: startTrainingMutation.mutate,
    startTrainingAsync: startTrainingMutation.mutateAsync,
    isStartingTraining: startTrainingMutation.isPending,
    cancelTask: cancelTaskMutation.mutate,
    isCancelling: cancelTaskMutation.isPending,
    downloadModel: downloadModelMutation.mutate,
    isDownloading: downloadModelMutation.isPending,
  }
}

export const useTrainingDocuments = (params?: {
  has_annotations?: boolean
  min_annotation_count?: number
  exclude_used_in_training?: boolean
  limit?: number
  offset?: number
}) => {
  const { data, isLoading, error } = useQuery({
    queryKey: ['training', 'documents', params],
    queryFn: () => trainingApi.getDocumentsForTraining(params),
    staleTime: 30000,
  })

  return {
    documents: data?.documents || [],
    total: data?.total || 0,
    isLoading,
    error,
  }
}
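
The downloadModel mutation above drives a browser download by creating a temporary object URL and a synthetic anchor click. A sketch of wiring it to a button (component name illustrative):

// Hypothetical button wiring for downloadModel; names are illustrative only.
import React from 'react'
import { useTraining } from './useTraining'

export const ModelDownloadButton: React.FC<{ taskId: string }> = ({ taskId }) => {
  const { downloadModel, isDownloading } = useTraining()
  return (
    <button onClick={() => downloadModel(taskId)} disabled={isDownloading}>
      {isDownloading ? 'Downloading…' : 'Download Model'}
    </button>
  )
}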

frontend/src/main.tsx (new file, 23 lines)
@@ -0,0 +1,23 @@
import React from 'react'
import ReactDOM from 'react-dom/client'
import { QueryClient, QueryClientProvider } from '@tanstack/react-query'
import App from './App'
import './styles/index.css'

const queryClient = new QueryClient({
  defaultOptions: {
    queries: {
      retry: 1,
      refetchOnWindowFocus: false,
      staleTime: 30000,
    },
  },
})

ReactDOM.createRoot(document.getElementById('root')!).render(
  <React.StrictMode>
    <QueryClientProvider client={queryClient}>
      <App />
    </QueryClientProvider>
  </React.StrictMode>
)

frontend/src/styles/index.css (new file, 26 lines)
@@ -0,0 +1,26 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

@layer base {
  body {
    @apply bg-warm-bg text-warm-text-primary;
  }

  /* Custom scrollbar */
  ::-webkit-scrollbar {
    @apply w-2 h-2;
  }

  ::-webkit-scrollbar-track {
    @apply bg-transparent;
  }

  ::-webkit-scrollbar-thumb {
    @apply bg-warm-divider rounded;
  }

  ::-webkit-scrollbar-thumb:hover {
    @apply bg-warm-text-disabled;
  }
}

frontend/src/types/index.ts (new file, 48 lines)
@@ -0,0 +1,48 @@
// Legacy types for backward compatibility with old components
// These will be gradually replaced with API types

export enum DocumentStatus {
  PENDING = 'Pending',
  LABELED = 'Labeled',
  VERIFIED = 'Verified',
  PARTIAL = 'Partial'
}

export interface Document {
  id: string
  name: string
  date: string
  status: DocumentStatus
  exported: boolean
  autoLabelProgress?: number
  autoLabelStatus?: 'Running' | 'Completed' | 'Failed'
}

export interface Annotation {
  id: string
  text: string
  label: string
  x: number
  y: number
  width: number
  height: number
  isAuto?: boolean
}

export interface TrainingJob {
  id: string
  name: string
  startDate: string
  status: 'Running' | 'Completed' | 'Failed'
  progress: number
  metrics?: {
    accuracy: number
    precision: number
    recall: number
  }
}

export interface ModelMetric {
  name: string
  value: number
}

frontend/tailwind.config.js (new file, 47 lines)
@@ -0,0 +1,47 @@
export default {
  content: ['./index.html', './src/**/*.{js,ts,jsx,tsx}'],
  theme: {
    extend: {
      fontFamily: {
        sans: ['Inter', 'SF Pro', 'system-ui', 'sans-serif'],
        mono: ['JetBrains Mono', 'SF Mono', 'monospace'],
      },
      colors: {
        warm: {
          bg: '#FAFAF8',
          card: '#FFFFFF',
          hover: '#F1F0ED',
          selected: '#ECEAE6',
          border: '#E6E4E1',
          divider: '#D8D6D2',
          text: {
            primary: '#121212',
            secondary: '#2A2A2A',
            muted: '#6B6B6B',
            disabled: '#9A9A9A',
          },
          state: {
            success: '#3E4A3A',
            error: '#4A3A3A',
            warning: '#4A4A3A',
            info: '#3A3A3A',
          }
        }
      },
      boxShadow: {
        'card': '0 1px 3px rgba(0,0,0,0.08)',
        'modal': '0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06)',
      },
      animation: {
        'fade-in': 'fadeIn 0.3s ease-out',
      },
      keyframes: {
        fadeIn: {
          '0%': { opacity: '0', transform: 'translateY(10px)' },
          '100%': { opacity: '1', transform: 'translateY(0)' },
        }
      }
    }
  },
  plugins: [],
}

frontend/tsconfig.json (new file, 29 lines)
@@ -0,0 +1,29 @@
{
  "compilerOptions": {
    "target": "ES2022",
    "experimentalDecorators": true,
    "useDefineForClassFields": false,
    "module": "ESNext",
    "lib": [
      "ES2022",
      "DOM",
      "DOM.Iterable"
    ],
    "skipLibCheck": true,
    "types": [
      "node"
    ],
    "moduleResolution": "bundler",
    "isolatedModules": true,
    "moduleDetection": "force",
    "allowJs": true,
    "jsx": "react-jsx",
    "paths": {
      "@/*": [
        "./*"
      ]
    },
    "allowImportingTsExtensions": true,
    "noEmit": true
  }
}

frontend/vite.config.ts (new file, 16 lines)
@@ -0,0 +1,16 @@
import { defineConfig } from 'vite';
import react from '@vitejs/plugin-react';

export default defineConfig({
  server: {
    port: 3000,
    host: '0.0.0.0',
    proxy: {
      '/api': {
        target: 'http://localhost:8000',
        changeOrigin: true,
      },
    },
  },
  plugins: [react()],
});
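
With this proxy, dev-server requests to relative /api paths are forwarded to the backend on port 8000, so components could use relative URLs instead of the hard-coded http://localhost:8000 origin seen in AdminAuth above. A sketch, reusing the token-creation endpoint and body shape from that component:

// Sketch: during development the Vite proxy forwards /api to localhost:8000.
const response = await fetch('/api/v1/admin/auth/token', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ name: 'my-laptop' }),
})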

@@ -21,3 +21,7 @@ pyyaml>=6.0  # YAML config files
 # Utilities
 tqdm>=4.65.0  # Progress bars
 python-dotenv>=1.0.0  # Environment variable management
+
+# Database
+psycopg2-binary>=2.9.0  # PostgreSQL driver
+sqlmodel>=0.0.22  # SQLModel ORM (SQLAlchemy + Pydantic)

@@ -16,7 +16,7 @@ from pathlib import Path
 from typing import Optional
 
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-from config import get_db_connection_string
+from src.config import get_db_connection_string
 
 from ..normalize import normalize_field
 from ..matcher import FieldMatcher

@@ -12,7 +12,7 @@ from collections import defaultdict
 from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-from config import get_db_connection_string
+from src.config import get_db_connection_string
 
 
 def load_reports_from_db() -> dict:

@@ -34,7 +34,7 @@ if sys.platform == 'win32':
     multiprocessing.set_start_method('spawn', force=True)
 
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-from config import get_db_connection_string, PATHS, AUTOLABEL
+from src.config import get_db_connection_string, PATHS, AUTOLABEL
 
 # Global OCR engine for worker processes (initialized once per worker)
 _worker_ocr_engine = None

@@ -16,7 +16,7 @@ from psycopg2.extras import execute_values
 
 # Add project root to path
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-from config import get_db_connection_string, PATHS
+from src.config import get_db_connection_string, PATHS
 
 
 def create_tables(conn):

@@ -10,6 +10,9 @@ import json
 import sys
 from pathlib import Path
 
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+from src.config import DEFAULT_DPI
+
 
 def main():
     parser = argparse.ArgumentParser(
@@ -38,8 +41,8 @@ def main():
     parser.add_argument(
         '--dpi',
         type=int,
-        default=150,
-        help='DPI for PDF rendering (default: 150, must match training)'
+        default=DEFAULT_DPI,
+        help=f'DPI for PDF rendering (default: {DEFAULT_DPI}, must match training)'
     )
     parser.add_argument(
         '--no-fallback',

@@ -17,6 +17,7 @@ from tqdm import tqdm
 
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
+from src.config import DEFAULT_DPI
 from src.data.db import DocumentDB
 from src.data.csv_loader import CSVLoader
 from src.normalize.normalizer import normalize_field
@@ -144,7 +145,7 @@ def process_single_document(args):
     ocr_engine = OCREngine()
     for page_no in range(pdf_doc.page_count):
         # Render page to image
-        img = pdf_doc.render_page(page_no, dpi=150)
+        img = pdf_doc.render_page(page_no, dpi=DEFAULT_DPI)
         if img is None:
             continue
 

@@ -15,6 +15,8 @@ from pathlib import Path
 project_root = Path(__file__).parent.parent.parent
 sys.path.insert(0, str(project_root))
 
+from src.config import DEFAULT_DPI
+
 
 def setup_logging(debug: bool = False) -> None:
     """Configure logging."""
@@ -65,8 +67,8 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--dpi",
         type=int,
-        default=150,
-        help="DPI for PDF rendering (must match training DPI)",
+        default=DEFAULT_DPI,
+        help=f"DPI for PDF rendering (default: {DEFAULT_DPI}, must match training DPI)",
     )
 
     parser.add_argument(

@@ -11,7 +11,7 @@ import sys
 from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-from config import PATHS
+from src.config import DEFAULT_DPI, PATHS
 
 
 def main():
@@ -103,8 +103,8 @@ def main():
     parser.add_argument(
         '--dpi',
         type=int,
-        default=150,
-        help='DPI used for rendering (default: 150, must match autolabel rendering)'
+        default=DEFAULT_DPI,
+        help=f'DPI used for rendering (default: {DEFAULT_DPI}, must match autolabel rendering)'
     )
     parser.add_argument(
         '--export-only',

@@ -8,9 +8,13 @@ from pathlib import Path
 from dotenv import load_dotenv
 
 # Load environment variables from .env file
-env_path = Path(__file__).parent / '.env'
+# .env is at project root, config.py is in src/
+env_path = Path(__file__).parent.parent / '.env'
 load_dotenv(dotenv_path=env_path)
 
+# Global DPI setting - must match training DPI for optimal model performance
+DEFAULT_DPI = 150
+
 
 def _is_wsl() -> bool:
     """Check if running inside WSL (Windows Subsystem for Linux)."""
@@ -69,7 +73,7 @@ else:
 # Auto-labeling Configuration
 AUTOLABEL = {
     'workers': 2,
-    'dpi': 150,
+    'dpi': DEFAULT_DPI,
     'min_confidence': 0.5,
     'train_ratio': 0.8,
     'val_ratio': 0.1,

src/data/admin_db.py (new file, 1156 lines)
File diff suppressed because it is too large

src/data/admin_models.py (new file, 339 lines)
@@ -0,0 +1,339 @@
"""
Admin API SQLModel Database Models

Defines the database schema for admin document management, annotations, and training tasks.
Includes batch upload support, training document links, and annotation history.
"""

from datetime import datetime
from typing import Any
from uuid import UUID, uuid4

from sqlmodel import Field, SQLModel, Column, JSON


# =============================================================================
# CSV to Field Class Mapping
# =============================================================================

CSV_TO_CLASS_MAPPING: dict[str, int] = {
    "InvoiceNumber": 0,                  # invoice_number
    "InvoiceDate": 1,                    # invoice_date
    "InvoiceDueDate": 2,                 # invoice_due_date
    "OCR": 3,                            # ocr_number
    "Bankgiro": 4,                       # bankgiro
    "Plusgiro": 5,                       # plusgiro
    "Amount": 6,                         # amount
    "supplier_organisation_number": 7,   # supplier_organisation_number
    # 8: payment_line (derived from OCR/Bankgiro/Amount)
    "customer_number": 9,                # customer_number
}


# =============================================================================
# Core Models
# =============================================================================


class AdminToken(SQLModel, table=True):
    """Admin authentication token."""

    __tablename__ = "admin_tokens"

    token: str = Field(primary_key=True, max_length=255)
    name: str = Field(max_length=255)
    is_active: bool = Field(default=True)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    last_used_at: datetime | None = Field(default=None)
    expires_at: datetime | None = Field(default=None)


class AdminDocument(SQLModel, table=True):
    """Document uploaded for labeling/annotation."""

    __tablename__ = "admin_documents"

    document_id: UUID = Field(default_factory=uuid4, primary_key=True)
    admin_token: str | None = Field(default=None, foreign_key="admin_tokens.token", max_length=255, index=True)
    filename: str = Field(max_length=255)
    file_size: int
    content_type: str = Field(max_length=100)
    file_path: str = Field(max_length=512)  # Path to stored file
    page_count: int = Field(default=1)
    status: str = Field(default="pending", max_length=20, index=True)
    # Status: pending, auto_labeling, labeled, exported
    auto_label_status: str | None = Field(default=None, max_length=20)
    # Auto-label status: running, completed, failed
    auto_label_error: str | None = Field(default=None)
    # v2: Upload source tracking
    upload_source: str = Field(default="ui", max_length=20)
    # Upload source: ui, api
    batch_id: UUID | None = Field(default=None, index=True)
    # Link to batch upload (if uploaded via ZIP)
    csv_field_values: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
    # Original CSV values for reference
    auto_label_queued_at: datetime | None = Field(default=None)
    # When auto-label was queued
    annotation_lock_until: datetime | None = Field(default=None)
    # Lock for manual annotation while auto-label runs
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)


class AdminAnnotation(SQLModel, table=True):
    """Annotation for a document (bounding box + label)."""

    __tablename__ = "admin_annotations"

    annotation_id: UUID = Field(default_factory=uuid4, primary_key=True)
    document_id: UUID = Field(foreign_key="admin_documents.document_id", index=True)
    page_number: int = Field(default=1)  # 1-indexed
    class_id: int  # 0-9 for invoice fields
    class_name: str = Field(max_length=50)  # e.g., "invoice_number"
    # Bounding box (normalized 0-1 coordinates)
    x_center: float
    y_center: float
    width: float
    height: float
    # Original pixel coordinates (for display)
    bbox_x: int
    bbox_y: int
    bbox_width: int
    bbox_height: int
    # OCR extracted text (if available)
    text_value: str | None = Field(default=None)
    confidence: float | None = Field(default=None)
    # Source: manual, auto, imported
    source: str = Field(default="manual", max_length=20, index=True)
    # v2: Verification fields
    is_verified: bool = Field(default=False, index=True)
    verified_at: datetime | None = Field(default=None)
    verified_by: str | None = Field(default=None, max_length=255)
    # v2: Override tracking
    override_source: str | None = Field(default=None, max_length=20)
    # If this annotation overrides another: 'auto' or 'imported'
    original_annotation_id: UUID | None = Field(default=None)
    # Reference to the annotation this overrides
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)


class TrainingTask(SQLModel, table=True):
    """Training/fine-tuning task."""

    __tablename__ = "training_tasks"

    task_id: UUID = Field(default_factory=uuid4, primary_key=True)
    admin_token: str = Field(foreign_key="admin_tokens.token", max_length=255, index=True)
    name: str = Field(max_length=255)
    description: str | None = Field(default=None)
    status: str = Field(default="pending", max_length=20, index=True)
    # Status: pending, scheduled, running, completed, failed, cancelled
    task_type: str = Field(default="train", max_length=20)
    # Task type: train, finetune
    # Training configuration
    config: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
    # Schedule settings
    scheduled_at: datetime | None = Field(default=None)
    cron_expression: str | None = Field(default=None, max_length=50)
    is_recurring: bool = Field(default=False)
    # Execution details
    started_at: datetime | None = Field(default=None)
    completed_at: datetime | None = Field(default=None)
    error_message: str | None = Field(default=None)
    # Result metrics
    result_metrics: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
    model_path: str | None = Field(default=None, max_length=512)
    # v2: Document count and extracted metrics
    document_count: int = Field(default=0)
    # Count of documents used in training
    metrics_mAP: float | None = Field(default=None, index=True)
    metrics_precision: float | None = Field(default=None)
    metrics_recall: float | None = Field(default=None)
    # Extracted metrics for easy querying
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)


class TrainingLog(SQLModel, table=True):
    """Training log entry."""

    __tablename__ = "training_logs"

    log_id: int | None = Field(default=None, primary_key=True)
    task_id: UUID = Field(foreign_key="training_tasks.task_id", index=True)
    level: str = Field(max_length=20)  # INFO, WARNING, ERROR
    message: str
    details: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
    created_at: datetime = Field(default_factory=datetime.utcnow, index=True)


# =============================================================================
# Batch Upload Models (v2)
# =============================================================================


class BatchUpload(SQLModel, table=True):
    """Batch upload of multiple documents via ZIP file."""

    __tablename__ = "batch_uploads"

    batch_id: UUID = Field(default_factory=uuid4, primary_key=True)
    admin_token: str = Field(foreign_key="admin_tokens.token", max_length=255, index=True)
    filename: str = Field(max_length=255)  # ZIP filename
    file_size: int
    upload_source: str = Field(default="ui", max_length=20)
    # Upload source: ui, api
    status: str = Field(default="processing", max_length=20, index=True)
    # Status: processing, completed, partial, failed
    total_files: int = Field(default=0)
    processed_files: int = Field(default=0)
    # Number of files processed so far
    successful_files: int = Field(default=0)
    failed_files: int = Field(default=0)
    csv_filename: str | None = Field(default=None, max_length=255)
    # CSV file used for auto-labeling
    csv_row_count: int | None = Field(default=None)
    error_message: str | None = Field(default=None)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    completed_at: datetime | None = Field(default=None)


class BatchUploadFile(SQLModel, table=True):
    """Individual file within a batch upload."""

    __tablename__ = "batch_upload_files"

    file_id: UUID = Field(default_factory=uuid4, primary_key=True)
    batch_id: UUID = Field(foreign_key="batch_uploads.batch_id", index=True)
    filename: str = Field(max_length=255)  # PDF filename within ZIP
    document_id: UUID | None = Field(default=None)
    # Link to created AdminDocument (if successful)
    status: str = Field(default="pending", max_length=20, index=True)
    # Status: pending, processing, completed, failed, skipped
    error_message: str | None = Field(default=None)
    annotation_count: int = Field(default=0)
    # Number of annotations created for this file
    csv_row_data: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
    # CSV row data for this file (if available)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    processed_at: datetime | None = Field(default=None)


# =============================================================================
# Training Document Link (v2)
# =============================================================================


class TrainingDocumentLink(SQLModel, table=True):
    """Junction table linking training tasks to documents."""

    __tablename__ = "training_document_links"

    link_id: UUID = Field(default_factory=uuid4, primary_key=True)
    task_id: UUID = Field(foreign_key="training_tasks.task_id", index=True)
    document_id: UUID = Field(foreign_key="admin_documents.document_id", index=True)
    annotation_snapshot: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
    # Snapshot of annotations at training time (includes count, verified count, etc.)
    created_at: datetime = Field(default_factory=datetime.utcnow)


# =============================================================================
# Annotation History (v2)
# =============================================================================


class AnnotationHistory(SQLModel, table=True):
    """History of annotation changes (for override tracking)."""

    __tablename__ = "annotation_history"

    history_id: UUID = Field(default_factory=uuid4, primary_key=True)
    annotation_id: UUID = Field(foreign_key="admin_annotations.annotation_id", index=True)
    document_id: UUID = Field(foreign_key="admin_documents.document_id", index=True)
|
||||||
|
# Change action: created, updated, deleted, override
|
||||||
|
action: str = Field(max_length=20, index=True)
|
||||||
|
# Previous value (for updates/deletes)
|
||||||
|
previous_value: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
|
||||||
|
# New value (for creates/updates)
|
||||||
|
new_value: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
|
||||||
|
# Change metadata
|
||||||
|
changed_by: str | None = Field(default=None, max_length=255)
|
||||||
|
# User/token who made the change
|
||||||
|
change_reason: str | None = Field(default=None)
|
||||||
|
# Optional reason for change
|
||||||
|
created_at: datetime = Field(default_factory=datetime.utcnow, index=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Field class mapping (same as src/cli/train.py)
|
||||||
|
FIELD_CLASSES = {
|
||||||
|
0: "invoice_number",
|
||||||
|
1: "invoice_date",
|
||||||
|
2: "invoice_due_date",
|
||||||
|
3: "ocr_number",
|
||||||
|
4: "bankgiro",
|
||||||
|
5: "plusgiro",
|
||||||
|
6: "amount",
|
||||||
|
7: "supplier_organisation_number",
|
||||||
|
8: "payment_line",
|
||||||
|
9: "customer_number",
|
||||||
|
}
|
||||||
|
|
||||||
|
FIELD_CLASS_IDS = {v: k for k, v in FIELD_CLASSES.items()}
|
||||||
|
|
||||||
|
|
||||||
|
# Read-only models for API responses
|
||||||
|
class AdminDocumentRead(SQLModel):
|
||||||
|
"""Admin document response model."""
|
||||||
|
|
||||||
|
document_id: UUID
|
||||||
|
filename: str
|
||||||
|
file_size: int
|
||||||
|
content_type: str
|
||||||
|
page_count: int
|
||||||
|
status: str
|
||||||
|
auto_label_status: str | None
|
||||||
|
auto_label_error: str | None
|
||||||
|
created_at: datetime
|
||||||
|
updated_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
class AdminAnnotationRead(SQLModel):
|
||||||
|
"""Admin annotation response model."""
|
||||||
|
|
||||||
|
annotation_id: UUID
|
||||||
|
document_id: UUID
|
||||||
|
page_number: int
|
||||||
|
class_id: int
|
||||||
|
class_name: str
|
||||||
|
x_center: float
|
||||||
|
y_center: float
|
||||||
|
width: float
|
||||||
|
height: float
|
||||||
|
bbox_x: int
|
||||||
|
bbox_y: int
|
||||||
|
bbox_width: int
|
||||||
|
bbox_height: int
|
||||||
|
text_value: str | None
|
||||||
|
confidence: float | None
|
||||||
|
source: str
|
||||||
|
created_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
class TrainingTaskRead(SQLModel):
|
||||||
|
"""Training task response model."""
|
||||||
|
|
||||||
|
task_id: UUID
|
||||||
|
name: str
|
||||||
|
description: str | None
|
||||||
|
status: str
|
||||||
|
task_type: str
|
||||||
|
config: dict[str, Any] | None
|
||||||
|
scheduled_at: datetime | None
|
||||||
|
is_recurring: bool
|
||||||
|
started_at: datetime | None
|
||||||
|
completed_at: datetime | None
|
||||||
|
error_message: str | None
|
||||||
|
result_metrics: dict[str, Any] | None
|
||||||
|
model_path: str | None
|
||||||
|
created_at: datetime
|
||||||
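A quick sketch of how the two mappings above translate between YOLO class indices and field names; the example values are illustrative:

    from src.data.admin_models import FIELD_CLASSES, FIELD_CLASS_IDS

    print(FIELD_CLASSES[6])                   # "amount"
    print(FIELD_CLASS_IDS["invoice_number"])  # 0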
374  src/data/async_request_db.py  Normal file
@@ -0,0 +1,374 @@
"""
Async Request Database Operations

Database interface for async invoice processing requests using SQLModel.
"""

import logging
from datetime import datetime, timedelta
from typing import Any
from uuid import UUID

from sqlalchemy import func, text
from sqlmodel import Session, select

from src.data.database import get_session_context, create_db_and_tables, close_engine
from src.data.models import ApiKey, AsyncRequest, RateLimitEvent

logger = logging.getLogger(__name__)


# Legacy dataclasses for backward compatibility
from dataclasses import dataclass


@dataclass(frozen=True)
class ApiKeyConfig:
    """API key configuration and limits (legacy compatibility)."""

    api_key: str
    name: str
    is_active: bool
    requests_per_minute: int
    max_concurrent_jobs: int
    max_file_size_mb: int


class AsyncRequestDB:
    """Database interface for async processing requests using SQLModel."""

    def __init__(self, connection_string: str | None = None) -> None:
        # connection_string is kept for backward compatibility but ignored
        # SQLModel uses the global engine from database.py
        self._initialized = False

    def connect(self):
        """Legacy method - returns self for compatibility."""
        return self

    def close(self) -> None:
        """Close database connections."""
        close_engine()

    def __enter__(self) -> "AsyncRequestDB":
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        pass  # Sessions are managed per-operation

    def create_tables(self) -> None:
        """Create async processing tables if they don't exist."""
        create_db_and_tables()
        self._initialized = True

    # ==========================================================================
    # API Key Operations
    # ==========================================================================

    def is_valid_api_key(self, api_key: str) -> bool:
        """Check if API key exists and is active."""
        with get_session_context() as session:
            result = session.get(ApiKey, api_key)
            return result is not None and result.is_active is True

    def get_api_key_config(self, api_key: str) -> ApiKeyConfig | None:
        """Get API key configuration and limits."""
        with get_session_context() as session:
            result = session.get(ApiKey, api_key)
            if result is None:
                return None
            return ApiKeyConfig(
                api_key=result.api_key,
                name=result.name,
                is_active=result.is_active,
                requests_per_minute=result.requests_per_minute,
                max_concurrent_jobs=result.max_concurrent_jobs,
                max_file_size_mb=result.max_file_size_mb,
            )

    def create_api_key(
        self,
        api_key: str,
        name: str,
        requests_per_minute: int = 10,
        max_concurrent_jobs: int = 3,
        max_file_size_mb: int = 50,
    ) -> None:
        """Create a new API key."""
        with get_session_context() as session:
            existing = session.get(ApiKey, api_key)
            if existing:
                existing.name = name
                existing.requests_per_minute = requests_per_minute
                existing.max_concurrent_jobs = max_concurrent_jobs
                existing.max_file_size_mb = max_file_size_mb
                session.add(existing)
            else:
                new_key = ApiKey(
                    api_key=api_key,
                    name=name,
                    requests_per_minute=requests_per_minute,
                    max_concurrent_jobs=max_concurrent_jobs,
                    max_file_size_mb=max_file_size_mb,
                )
                session.add(new_key)

    def update_api_key_usage(self, api_key: str) -> None:
        """Update API key last used timestamp and increment total requests."""
        with get_session_context() as session:
            key = session.get(ApiKey, api_key)
            if key:
                key.last_used_at = datetime.utcnow()
                key.total_requests += 1
                session.add(key)

    # ==========================================================================
    # Async Request Operations
    # ==========================================================================

    def create_request(
        self,
        api_key: str,
        filename: str,
        file_size: int,
        content_type: str,
        expires_at: datetime,
        request_id: str | None = None,
    ) -> str:
        """Create a new async request."""
        with get_session_context() as session:
            request = AsyncRequest(
                api_key=api_key,
                filename=filename,
                file_size=file_size,
                content_type=content_type,
                expires_at=expires_at,
            )
            if request_id:
                request.request_id = UUID(request_id)
            session.add(request)
            session.flush()  # To get the generated ID
            return str(request.request_id)

    def get_request(self, request_id: str) -> AsyncRequest | None:
        """Get a single async request by ID."""
        with get_session_context() as session:
            result = session.get(AsyncRequest, UUID(request_id))
            if result:
                # Detach from session for use outside context
                session.expunge(result)
            return result

    def get_request_by_api_key(
        self,
        request_id: str,
        api_key: str,
    ) -> AsyncRequest | None:
        """Get a request only if it belongs to the given API key."""
        with get_session_context() as session:
            statement = select(AsyncRequest).where(
                AsyncRequest.request_id == UUID(request_id),
                AsyncRequest.api_key == api_key,
            )
            result = session.exec(statement).first()
            if result:
                session.expunge(result)
            return result

    def update_status(
        self,
        request_id: str,
        status: str,
        error_message: str | None = None,
        increment_retry: bool = False,
    ) -> None:
        """Update request status."""
        with get_session_context() as session:
            request = session.get(AsyncRequest, UUID(request_id))
            if request:
                request.status = status
                if status == "processing":
                    request.started_at = datetime.utcnow()
                if error_message is not None:
                    request.error_message = error_message
                if increment_retry:
                    request.retry_count += 1
                session.add(request)

    def complete_request(
        self,
        request_id: str,
        document_id: str,
        result: dict[str, Any],
        processing_time_ms: float,
        visualization_path: str | None = None,
    ) -> None:
        """Mark request as completed with result."""
        with get_session_context() as session:
            request = session.get(AsyncRequest, UUID(request_id))
            if request:
                request.status = "completed"
                request.document_id = document_id
                request.result = result
                request.processing_time_ms = processing_time_ms
                request.visualization_path = visualization_path
                request.completed_at = datetime.utcnow()
                session.add(request)

    def get_requests_by_api_key(
        self,
        api_key: str,
        status: str | None = None,
        limit: int = 20,
        offset: int = 0,
    ) -> tuple[list[AsyncRequest], int]:
        """Get paginated requests for an API key."""
        with get_session_context() as session:
            # Count query
            count_stmt = select(func.count()).select_from(AsyncRequest).where(
                AsyncRequest.api_key == api_key
            )
            if status:
                count_stmt = count_stmt.where(AsyncRequest.status == status)
            total = session.exec(count_stmt).one()

            # Fetch query
            statement = select(AsyncRequest).where(
                AsyncRequest.api_key == api_key
            )
            if status:
                statement = statement.where(AsyncRequest.status == status)
            statement = statement.order_by(AsyncRequest.created_at.desc())
            statement = statement.offset(offset).limit(limit)

            results = session.exec(statement).all()
            # Detach results from session
            for r in results:
                session.expunge(r)
            return list(results), total

    def count_active_jobs(self, api_key: str) -> int:
        """Count active (pending + processing) jobs for an API key."""
        with get_session_context() as session:
            statement = select(func.count()).select_from(AsyncRequest).where(
                AsyncRequest.api_key == api_key,
                AsyncRequest.status.in_(["pending", "processing"]),
            )
            return session.exec(statement).one()

    def get_pending_requests(self, limit: int = 10) -> list[AsyncRequest]:
        """Get pending requests ordered by creation time."""
        with get_session_context() as session:
            statement = select(AsyncRequest).where(
                AsyncRequest.status == "pending"
            ).order_by(AsyncRequest.created_at).limit(limit)
            results = session.exec(statement).all()
            for r in results:
                session.expunge(r)
            return list(results)

    def get_queue_position(self, request_id: str) -> int | None:
        """Get position of a request in the pending queue."""
        with get_session_context() as session:
            # Get the request's created_at
            request = session.get(AsyncRequest, UUID(request_id))
            if not request:
                return None

            # Count pending requests created before this one
            statement = select(func.count()).select_from(AsyncRequest).where(
                AsyncRequest.status == "pending",
                AsyncRequest.created_at < request.created_at,
            )
            count = session.exec(statement).one()
            return count + 1  # 1-based position

    # ==========================================================================
    # Rate Limit Operations
    # ==========================================================================

    def record_rate_limit_event(self, api_key: str, event_type: str) -> None:
        """Record a rate limit event."""
        with get_session_context() as session:
            event = RateLimitEvent(
                api_key=api_key,
                event_type=event_type,
            )
            session.add(event)

    def count_recent_requests(self, api_key: str, seconds: int = 60) -> int:
        """Count requests in the last N seconds."""
        with get_session_context() as session:
            cutoff = datetime.utcnow() - timedelta(seconds=seconds)
            statement = select(func.count()).select_from(RateLimitEvent).where(
                RateLimitEvent.api_key == api_key,
                RateLimitEvent.event_type == "request",
                RateLimitEvent.created_at > cutoff,
            )
            return session.exec(statement).one()

    # ==========================================================================
    # Cleanup Operations
    # ==========================================================================

    def delete_expired_requests(self) -> int:
        """Delete requests that have expired. Returns count of deleted rows."""
        with get_session_context() as session:
            now = datetime.utcnow()
            statement = select(AsyncRequest).where(AsyncRequest.expires_at < now)
            expired = session.exec(statement).all()
            count = len(expired)
            for request in expired:
                session.delete(request)
            logger.info(f"Deleted {count} expired async requests")
            return count

    def cleanup_old_rate_limit_events(self, hours: int = 1) -> int:
        """Delete rate limit events older than N hours."""
        with get_session_context() as session:
            cutoff = datetime.utcnow() - timedelta(hours=hours)
            statement = select(RateLimitEvent).where(
                RateLimitEvent.created_at < cutoff
            )
            old_events = session.exec(statement).all()
            count = len(old_events)
            for event in old_events:
                session.delete(event)
            return count

    def reset_stale_processing_requests(
        self,
        stale_minutes: int = 10,
        max_retries: int = 3,
    ) -> int:
        """
        Reset requests stuck in 'processing' status.

        Requests that have been processing for more than stale_minutes
        are considered stale. They are either reset to 'pending' (if under
        max_retries) or set to 'failed'.
        """
        with get_session_context() as session:
            cutoff = datetime.utcnow() - timedelta(minutes=stale_minutes)
            reset_count = 0

            # Find stale processing requests
            statement = select(AsyncRequest).where(
                AsyncRequest.status == "processing",
                AsyncRequest.started_at < cutoff,
            )
            stale_requests = session.exec(statement).all()

            for request in stale_requests:
                if request.retry_count < max_retries:
                    request.status = "pending"
                    request.started_at = None
                else:
                    request.status = "failed"
                    request.error_message = "Processing timeout after max retries"
                session.add(request)
                reset_count += 1

            if reset_count > 0:
                logger.warning(f"Reset {reset_count} stale processing requests")
            return reset_count
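A minimal sketch of the intended request lifecycle through AsyncRequestDB, using the dev key seeded by the migration below; the document_id, result payload, and 24-hour expiry are made up for illustration:

    from datetime import datetime, timedelta
    from src.data.async_request_db import AsyncRequestDB

    db = AsyncRequestDB()
    db.create_tables()
    request_id = db.create_request(
        api_key="dev-api-key-12345",
        filename="invoice.pdf",
        file_size=123_456,
        content_type="application/pdf",
        expires_at=datetime.utcnow() + timedelta(hours=24),  # assumed window
    )
    db.update_status(request_id, "processing")  # stamps started_at
    db.complete_request(
        request_id,
        document_id="doc-0001",        # hypothetical
        result={"amount": "100.00"},   # hypothetical payload
        processing_time_ms=850.0,
    )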
103  src/data/database.py  Normal file
@@ -0,0 +1,103 @@
"""
Database Engine and Session Management

Provides SQLModel database engine and session handling.
"""

import logging
from contextlib import contextmanager
from pathlib import Path
from typing import Generator

from sqlalchemy import text
from sqlmodel import Session, SQLModel, create_engine

import sys
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from src.config import get_db_connection_string

logger = logging.getLogger(__name__)

# Global engine instance
_engine = None


def get_engine():
    """Get or create the database engine."""
    global _engine
    if _engine is None:
        connection_string = get_db_connection_string()
        # Convert psycopg2 format to SQLAlchemy format
        if connection_string.startswith("postgresql://"):
            # Already in correct format
            pass
        elif "host=" in connection_string:
            # Convert DSN format to URL format
            parts = dict(item.split("=") for item in connection_string.split())
            connection_string = (
                f"postgresql://{parts.get('user', '')}:{parts.get('password', '')}"
                f"@{parts.get('host', 'localhost')}:{parts.get('port', '5432')}"
                f"/{parts.get('dbname', 'docmaster')}"
            )

        _engine = create_engine(
            connection_string,
            echo=False,  # Set to True for SQL debugging
            pool_pre_ping=True,  # Verify connections before use
            pool_size=5,
            max_overflow=10,
        )
    return _engine


def create_db_and_tables() -> None:
    """Create all database tables."""
    from src.data.models import ApiKey, AsyncRequest, RateLimitEvent  # noqa: F401
    from src.data.admin_models import (  # noqa: F401
        AdminToken,
        AdminDocument,
        AdminAnnotation,
        TrainingTask,
        TrainingLog,
    )

    engine = get_engine()
    SQLModel.metadata.create_all(engine)
    logger.info("Database tables created/verified")


def get_session() -> Session:
    """Get a new database session."""
    engine = get_engine()
    return Session(engine)


@contextmanager
def get_session_context() -> Generator[Session, None, None]:
    """Context manager for database sessions with auto-commit/rollback."""
    session = get_session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()


def close_engine() -> None:
    """Close the database engine and release connections."""
    global _engine
    if _engine is not None:
        _engine.dispose()
        _engine = None
        logger.info("Database engine closed")


def execute_raw_sql(sql: str) -> None:
    """Execute raw SQL (for migrations)."""
    engine = get_engine()
    with engine.connect() as conn:
        conn.execute(text(sql))
        conn.commit()
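A minimal sketch of the session contract established by get_session_context above: commit on clean exit, rollback on exception:

    from src.data.database import get_session_context
    from src.data.models import ApiKey

    with get_session_context() as session:
        key = session.get(ApiKey, "dev-api-key-12345")
        if key:
            key.total_processed += 1
            session.add(key)  # committed automatically when the block exits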
@@ -10,7 +10,7 @@ import sys
 from pathlib import Path

 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-from config import get_db_connection_string
+from src.config import get_db_connection_string


 class DocumentDB:
83  src/data/migrations/001_async_tables.sql  Normal file
@@ -0,0 +1,83 @@
-- Async Invoice Processing Tables
-- Migration: 001_async_tables.sql
-- Created: 2024-01-15

-- API Keys table for authentication and rate limiting
CREATE TABLE IF NOT EXISTS api_keys (
    api_key TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    is_active BOOLEAN DEFAULT true,

    -- Rate limits
    requests_per_minute INTEGER DEFAULT 10,
    max_concurrent_jobs INTEGER DEFAULT 3,
    max_file_size_mb INTEGER DEFAULT 50,

    -- Usage tracking
    total_requests INTEGER DEFAULT 0,
    total_processed INTEGER DEFAULT 0,

    -- Timestamps
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    last_used_at TIMESTAMPTZ
);

-- Async processing requests table
CREATE TABLE IF NOT EXISTS async_requests (
    request_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    api_key TEXT NOT NULL REFERENCES api_keys(api_key) ON DELETE CASCADE,
    status TEXT NOT NULL DEFAULT 'pending',
    filename TEXT NOT NULL,
    file_size INTEGER NOT NULL,
    content_type TEXT NOT NULL,

    -- Processing metadata
    document_id TEXT,
    error_message TEXT,
    retry_count INTEGER DEFAULT 0,

    -- Timestamps
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    expires_at TIMESTAMPTZ NOT NULL,

    -- Result storage (JSONB for flexibility)
    result JSONB,

    -- Processing time
    processing_time_ms REAL,

    -- Visualization path
    visualization_path TEXT,

    CONSTRAINT valid_status CHECK (status IN ('pending', 'processing', 'completed', 'failed'))
);

-- Indexes for async_requests
CREATE INDEX IF NOT EXISTS idx_async_requests_api_key ON async_requests(api_key);
CREATE INDEX IF NOT EXISTS idx_async_requests_status ON async_requests(status);
CREATE INDEX IF NOT EXISTS idx_async_requests_created_at ON async_requests(created_at);
CREATE INDEX IF NOT EXISTS idx_async_requests_expires_at ON async_requests(expires_at);
CREATE INDEX IF NOT EXISTS idx_async_requests_api_key_status ON async_requests(api_key, status);

-- Rate limit tracking table
CREATE TABLE IF NOT EXISTS rate_limit_events (
    id SERIAL PRIMARY KEY,
    api_key TEXT NOT NULL REFERENCES api_keys(api_key) ON DELETE CASCADE,
    event_type TEXT NOT NULL, -- 'request', 'complete', 'fail'
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Index for rate limiting queries (recent events only)
CREATE INDEX IF NOT EXISTS idx_rate_limit_events_api_key_time
    ON rate_limit_events(api_key, created_at DESC);

-- Cleanup old rate limit events index
CREATE INDEX IF NOT EXISTS idx_rate_limit_events_cleanup
    ON rate_limit_events(created_at);

-- Insert default API key for development/testing
INSERT INTO api_keys (api_key, name, requests_per_minute, max_concurrent_jobs)
VALUES ('dev-api-key-12345', 'Development Key', 100, 10)
ON CONFLICT (api_key) DO NOTHING;
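One way to apply this migration is through execute_raw_sql from src/data/database.py; this assumes the process runs from the repository root and that the driver accepts the semicolon-separated script in a single call (psycopg2 does):

    from pathlib import Path
    from src.data.database import execute_raw_sql

    sql = Path("src/data/migrations/001_async_tables.sql").read_text()
    execute_raw_sql(sql)  # runs the whole multi-statement script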
5  src/data/migrations/002_nullable_admin_token.sql  Normal file
@@ -0,0 +1,5 @@
-- Migration: Make admin_token nullable in admin_documents table
-- This allows documents uploaded via public API to not require an admin token

ALTER TABLE admin_documents
    ALTER COLUMN admin_token DROP NOT NULL;
95  src/data/models.py  Normal file
@@ -0,0 +1,95 @@
"""
SQLModel Database Models

Defines the database schema using SQLModel (SQLAlchemy + Pydantic).
"""

from datetime import datetime
from typing import Any
from uuid import UUID, uuid4

from sqlmodel import Field, SQLModel, Column, JSON


class ApiKey(SQLModel, table=True):
    """API key configuration and limits."""

    __tablename__ = "api_keys"

    api_key: str = Field(primary_key=True, max_length=255)
    name: str = Field(max_length=255)
    is_active: bool = Field(default=True)
    requests_per_minute: int = Field(default=10)
    max_concurrent_jobs: int = Field(default=3)
    max_file_size_mb: int = Field(default=50)
    total_requests: int = Field(default=0)
    total_processed: int = Field(default=0)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    last_used_at: datetime | None = Field(default=None)


class AsyncRequest(SQLModel, table=True):
    """Async request record."""

    __tablename__ = "async_requests"

    request_id: UUID = Field(default_factory=uuid4, primary_key=True)
    api_key: str = Field(foreign_key="api_keys.api_key", max_length=255, index=True)
    status: str = Field(default="pending", max_length=20, index=True)
    filename: str = Field(max_length=255)
    file_size: int
    content_type: str = Field(max_length=100)
    document_id: str | None = Field(default=None, max_length=100)
    error_message: str | None = Field(default=None)
    retry_count: int = Field(default=0)
    created_at: datetime = Field(default_factory=datetime.utcnow)
    started_at: datetime | None = Field(default=None)
    completed_at: datetime | None = Field(default=None)
    expires_at: datetime = Field(index=True)
    result: dict[str, Any] | None = Field(default=None, sa_column=Column(JSON))
    processing_time_ms: float | None = Field(default=None)
    visualization_path: str | None = Field(default=None, max_length=255)


class RateLimitEvent(SQLModel, table=True):
    """Rate limit event record."""

    __tablename__ = "rate_limit_events"

    id: int | None = Field(default=None, primary_key=True)
    api_key: str = Field(foreign_key="api_keys.api_key", max_length=255, index=True)
    event_type: str = Field(max_length=50)
    created_at: datetime = Field(default_factory=datetime.utcnow, index=True)


# Read-only models for responses (without table=True)
class ApiKeyRead(SQLModel):
    """API key response model (read-only)."""

    api_key: str
    name: str
    is_active: bool
    requests_per_minute: int
    max_concurrent_jobs: int
    max_file_size_mb: int


class AsyncRequestRead(SQLModel):
    """Async request response model (read-only)."""

    request_id: UUID
    api_key: str
    status: str
    filename: str
    file_size: int
    content_type: str
    document_id: str | None
    error_message: str | None
    retry_count: int
    created_at: datetime
    started_at: datetime | None
    completed_at: datetime | None
    expires_at: datetime
    result: dict[str, Any] | None
    processing_time_ms: float | None
    visualization_path: str | None
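A small sketch of how the table models behave as plain Pydantic-style objects before persistence; defaults such as status and the generated request_id are filled in at construction time:

    from datetime import datetime, timedelta
    from src.data.models import AsyncRequest

    req = AsyncRequest(
        api_key="dev-api-key-12345",
        filename="invoice.pdf",
        file_size=123_456,
        content_type="application/pdf",
        expires_at=datetime.utcnow() + timedelta(hours=24),  # assumed window
    )
    print(req.status)      # "pending" (default)
    print(req.request_id)  # generated uuid4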
@@ -12,6 +12,8 @@ import warnings
 from pathlib import Path
 from typing import Any, Dict, Optional

+from src.config import DEFAULT_DPI
+
 # Global OCR instance (initialized once per GPU worker process)
 _ocr_engine: Optional[Any] = None

@@ -94,7 +96,7 @@ def process_text_pdf(task_data: Dict[str, Any]) -> Dict[str, Any]:
     row_dict = task_data["row_dict"]
     pdf_path = Path(task_data["pdf_path"])
     output_dir = Path(task_data["output_dir"])
-    dpi = task_data.get("dpi", 150)
+    dpi = task_data.get("dpi", DEFAULT_DPI)
     min_confidence = task_data.get("min_confidence", 0.5)

     start_time = time.time()
@@ -212,7 +214,7 @@ def process_scanned_pdf(task_data: Dict[str, Any]) -> Dict[str, Any]:
     row_dict = task_data["row_dict"]
     pdf_path = Path(task_data["pdf_path"])
     output_dir = Path(task_data["output_dir"])
-    dpi = task_data.get("dpi", 150)
+    dpi = task_data.get("dpi", DEFAULT_DPI)
     min_confidence = task_data.get("min_confidence", 0.5)

     start_time = time.time()
@@ -16,6 +16,8 @@ from datetime import datetime
 import psycopg2
 from psycopg2.extras import execute_values

+from src.config import DEFAULT_DPI
+

 @dataclass
 class LLMExtractionResult:
@@ -265,7 +267,7 @@ Return ONLY the JSON object, no other text."""
         self,
         pdf_path: Path,
         page_no: int = 0,
-        dpi: int = 150,
+        dpi: int = DEFAULT_DPI,
         max_size_mb: float = 18.0
     ) -> bytes:
         """
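These hunks replace the hard-coded 150 fallback with a shared constant. src/config.py itself is not part of this diff; from the hunks one can only infer a definition along these lines:

    # src/config.py (assumed shape; only the name and old default are implied)
    DEFAULT_DPI = 150  # single source of truth for rendering resolution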
8  src/web/admin_routes_new.py  Normal file
@@ -0,0 +1,8 @@
"""
Backward compatibility shim for admin_routes.py

DEPRECATED: Import from src.web.api.v1.admin.documents instead.
"""
from src.web.api.v1.admin.documents import *

__all__ = ["create_admin_router"]
0  src/web/api/__init__.py  Normal file
0  src/web/api/v1/__init__.py  Normal file
19  src/web/api/v1/admin/__init__.py  Normal file
@@ -0,0 +1,19 @@
"""
Admin API v1

Document management, annotations, and training endpoints.
"""

from src.web.api.v1.admin.annotations import create_annotation_router
from src.web.api.v1.admin.auth import create_auth_router
from src.web.api.v1.admin.documents import create_documents_router
from src.web.api.v1.admin.locks import create_locks_router
from src.web.api.v1.admin.training import create_training_router

__all__ = [
    "create_annotation_router",
    "create_auth_router",
    "create_documents_router",
    "create_locks_router",
    "create_training_router",
]
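A hypothetical sketch of wiring these router factories into an application; the FastAPI app object and mounting details are assumptions, not part of this diff:

    from fastapi import FastAPI
    from src.web.api.v1.admin import (
        create_annotation_router,
        create_auth_router,
        create_documents_router,
        create_locks_router,
        create_training_router,
    )

    app = FastAPI()
    for factory in (
        create_annotation_router,
        create_auth_router,
        create_documents_router,
        create_locks_router,
        create_training_router,
    ):
        app.include_router(factory())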
644  src/web/api/v1/admin/annotations.py  Normal file
@@ -0,0 +1,644 @@
"""
Admin Annotation API Routes

FastAPI endpoints for annotation management.
"""

import logging
from pathlib import Path
from typing import Annotated
from uuid import UUID

from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import FileResponse

from src.data.admin_db import AdminDB
from src.data.admin_models import FIELD_CLASSES, FIELD_CLASS_IDS
from src.web.core.auth import AdminTokenDep, AdminDBDep
from src.web.services.autolabel import get_auto_label_service
from src.web.schemas.admin import (
    AnnotationCreate,
    AnnotationItem,
    AnnotationListResponse,
    AnnotationOverrideRequest,
    AnnotationOverrideResponse,
    AnnotationResponse,
    AnnotationSource,
    AnnotationUpdate,
    AnnotationVerifyRequest,
    AnnotationVerifyResponse,
    AutoLabelRequest,
    AutoLabelResponse,
    BoundingBox,
)
from src.web.schemas.common import ErrorResponse

logger = logging.getLogger(__name__)

# Image storage directory
ADMIN_IMAGES_DIR = Path("data/admin_images")


def _validate_uuid(value: str, name: str = "ID") -> None:
    """Validate UUID format."""
    try:
        UUID(value)
    except ValueError:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {name} format. Must be a valid UUID.",
        )


def create_annotation_router() -> APIRouter:
    """Create annotation API router."""
    router = APIRouter(prefix="/admin/documents", tags=["Admin Annotations"])

    # =========================================================================
    # Image Endpoints
    # =========================================================================

    @router.get(
        "/{document_id}/images/{page_number}",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Not found"},
        },
        summary="Get page image",
        description="Get the image for a specific page.",
    )
    async def get_page_image(
        document_id: str,
        page_number: int,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> FileResponse:
        """Get page image."""
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Validate page number
        if page_number < 1 or page_number > document.page_count:
            raise HTTPException(
                status_code=404,
                detail=f"Page {page_number} not found. Document has {document.page_count} pages.",
            )

        # Find image file
        image_path = ADMIN_IMAGES_DIR / document_id / f"page_{page_number}.png"
        if not image_path.exists():
            raise HTTPException(
                status_code=404,
                detail=f"Image for page {page_number} not found",
            )

        return FileResponse(
            path=str(image_path),
            media_type="image/png",
            filename=f"{document.filename}_page_{page_number}.png",
        )

    # =========================================================================
    # Annotation Endpoints
    # =========================================================================

    @router.get(
        "/{document_id}/annotations",
        response_model=AnnotationListResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="List annotations",
        description="Get all annotations for a document.",
    )
    async def list_annotations(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        page_number: Annotated[
            int | None,
            Query(ge=1, description="Filter by page number"),
        ] = None,
    ) -> AnnotationListResponse:
        """List annotations for a document."""
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Get annotations
        raw_annotations = db.get_annotations_for_document(document_id, page_number)
        annotations = [
            AnnotationItem(
                annotation_id=str(ann.annotation_id),
                page_number=ann.page_number,
                class_id=ann.class_id,
                class_name=ann.class_name,
                bbox=BoundingBox(
                    x=ann.bbox_x,
                    y=ann.bbox_y,
                    width=ann.bbox_width,
                    height=ann.bbox_height,
                ),
                normalized_bbox={
                    "x_center": ann.x_center,
                    "y_center": ann.y_center,
                    "width": ann.width,
                    "height": ann.height,
                },
                text_value=ann.text_value,
                confidence=ann.confidence,
                source=AnnotationSource(ann.source),
                created_at=ann.created_at,
            )
            for ann in raw_annotations
        ]

        return AnnotationListResponse(
            document_id=document_id,
            page_count=document.page_count,
            total_annotations=len(annotations),
            annotations=annotations,
        )

    @router.post(
        "/{document_id}/annotations",
        response_model=AnnotationResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid request"},
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Create annotation",
        description="Create a new annotation for a document.",
    )
    async def create_annotation(
        document_id: str,
        request: AnnotationCreate,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> AnnotationResponse:
        """Create a new annotation."""
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Validate page number
        if request.page_number > document.page_count:
            raise HTTPException(
                status_code=400,
                detail=f"Page {request.page_number} exceeds document page count ({document.page_count})",
            )

        # Get image dimensions for normalization
        image_path = ADMIN_IMAGES_DIR / document_id / f"page_{request.page_number}.png"
        if not image_path.exists():
            raise HTTPException(
                status_code=400,
                detail=f"Image for page {request.page_number} not available",
            )

        from PIL import Image
        with Image.open(image_path) as img:
            image_width, image_height = img.size

        # Calculate normalized coordinates
        x_center = (request.bbox.x + request.bbox.width / 2) / image_width
        y_center = (request.bbox.y + request.bbox.height / 2) / image_height
        width = request.bbox.width / image_width
        height = request.bbox.height / image_height

        # Get class name
        class_name = FIELD_CLASSES.get(request.class_id, f"class_{request.class_id}")

        # Create annotation
        annotation_id = db.create_annotation(
            document_id=document_id,
            page_number=request.page_number,
            class_id=request.class_id,
            class_name=class_name,
            x_center=x_center,
            y_center=y_center,
            width=width,
            height=height,
            bbox_x=request.bbox.x,
            bbox_y=request.bbox.y,
            bbox_width=request.bbox.width,
            bbox_height=request.bbox.height,
            text_value=request.text_value,
            source="manual",
        )

        # Keep status as pending - user must click "Mark Complete" to finalize
        # This allows user to add multiple annotations before saving to PostgreSQL

        return AnnotationResponse(
            annotation_id=annotation_id,
            message="Annotation created successfully",
        )

    @router.patch(
        "/{document_id}/annotations/{annotation_id}",
        response_model=AnnotationResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid request"},
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Not found"},
        },
        summary="Update annotation",
        description="Update an existing annotation.",
    )
    async def update_annotation(
        document_id: str,
        annotation_id: str,
        request: AnnotationUpdate,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> AnnotationResponse:
        """Update an annotation."""
        _validate_uuid(document_id, "document_id")
        _validate_uuid(annotation_id, "annotation_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Get existing annotation
        annotation = db.get_annotation(annotation_id)
        if annotation is None:
            raise HTTPException(
                status_code=404,
                detail="Annotation not found",
            )

        # Verify annotation belongs to document
        if str(annotation.document_id) != document_id:
            raise HTTPException(
                status_code=404,
                detail="Annotation does not belong to this document",
            )

        # Prepare update data
        update_kwargs = {}

        if request.class_id is not None:
            update_kwargs["class_id"] = request.class_id
            update_kwargs["class_name"] = FIELD_CLASSES.get(
                request.class_id, f"class_{request.class_id}"
            )

        if request.text_value is not None:
            update_kwargs["text_value"] = request.text_value

        if request.bbox is not None:
            # Get image dimensions
            image_path = ADMIN_IMAGES_DIR / document_id / f"page_{annotation.page_number}.png"
            from PIL import Image
            with Image.open(image_path) as img:
                image_width, image_height = img.size

            # Calculate normalized coordinates
            update_kwargs["x_center"] = (request.bbox.x + request.bbox.width / 2) / image_width
            update_kwargs["y_center"] = (request.bbox.y + request.bbox.height / 2) / image_height
            update_kwargs["width"] = request.bbox.width / image_width
            update_kwargs["height"] = request.bbox.height / image_height
            update_kwargs["bbox_x"] = request.bbox.x
            update_kwargs["bbox_y"] = request.bbox.y
            update_kwargs["bbox_width"] = request.bbox.width
            update_kwargs["bbox_height"] = request.bbox.height

        # Update annotation
        if update_kwargs:
            success = db.update_annotation(annotation_id, **update_kwargs)
            if not success:
                raise HTTPException(
                    status_code=500,
                    detail="Failed to update annotation",
                )

        return AnnotationResponse(
            annotation_id=annotation_id,
            message="Annotation updated successfully",
        )

    @router.delete(
        "/{document_id}/annotations/{annotation_id}",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Not found"},
        },
        summary="Delete annotation",
        description="Delete an annotation.",
    )
    async def delete_annotation(
        document_id: str,
        annotation_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> dict:
        """Delete an annotation."""
        _validate_uuid(document_id, "document_id")
        _validate_uuid(annotation_id, "annotation_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Get existing annotation
        annotation = db.get_annotation(annotation_id)
        if annotation is None:
            raise HTTPException(
                status_code=404,
                detail="Annotation not found",
            )

        # Verify annotation belongs to document
        if str(annotation.document_id) != document_id:
            raise HTTPException(
                status_code=404,
                detail="Annotation does not belong to this document",
            )

        # Delete annotation
        db.delete_annotation(annotation_id)

        return {
            "status": "deleted",
            "annotation_id": annotation_id,
            "message": "Annotation deleted successfully",
        }

    # =========================================================================
    # Auto-Labeling Endpoints
    # =========================================================================

    @router.post(
        "/{document_id}/auto-label",
        response_model=AutoLabelResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid request"},
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Trigger auto-labeling",
        description="Trigger auto-labeling for a document using field values.",
    )
    async def trigger_auto_label(
        document_id: str,
        request: AutoLabelRequest,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> AutoLabelResponse:
        """Trigger auto-labeling for a document."""
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Validate field values
        if not request.field_values:
            raise HTTPException(
                status_code=400,
                detail="At least one field value is required",
            )

        # Run auto-labeling
        service = get_auto_label_service()
        result = service.auto_label_document(
            document_id=document_id,
            file_path=document.file_path,
            field_values=request.field_values,
            db=db,
            replace_existing=request.replace_existing,
        )

        if result["status"] == "failed":
            raise HTTPException(
                status_code=500,
                detail=f"Auto-labeling failed: {result.get('error', 'Unknown error')}",
            )

        return AutoLabelResponse(
            document_id=document_id,
            status=result["status"],
            annotations_created=result["annotations_created"],
            message=f"Auto-labeling completed. Created {result['annotations_created']} annotations.",
        )

    @router.delete(
        "/{document_id}/annotations",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Delete all annotations",
        description="Delete all annotations for a document (optionally filter by source).",
    )
    async def delete_all_annotations(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        source: Annotated[
            str | None,
            Query(description="Filter by source (manual, auto, imported)"),
        ] = None,
    ) -> dict:
        """Delete all annotations for a document."""
        _validate_uuid(document_id, "document_id")

        # Validate source
        if source and source not in ("manual", "auto", "imported"):
            raise HTTPException(
                status_code=400,
                detail=f"Invalid source: {source}",
            )

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Delete annotations
        deleted_count = db.delete_annotations_for_document(document_id, source)

        # Update document status if all annotations deleted
        remaining = db.get_annotations_for_document(document_id)
        if not remaining:
            db.update_document_status(document_id, "pending")

        return {
            "status": "deleted",
            "document_id": document_id,
            "deleted_count": deleted_count,
            "message": f"Deleted {deleted_count} annotations",
        }

    # =========================================================================
    # Phase 5: Annotation Enhancement
    # =========================================================================

    @router.post(
        "/{document_id}/annotations/{annotation_id}/verify",
        response_model=AnnotationVerifyResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Annotation not found"},
        },
        summary="Verify annotation",
        description="Mark an annotation as verified by a human reviewer.",
    )
    async def verify_annotation(
        document_id: str,
        annotation_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        request: AnnotationVerifyRequest = AnnotationVerifyRequest(),
    ) -> AnnotationVerifyResponse:
        """Verify an annotation."""
        _validate_uuid(document_id, "document_id")
        _validate_uuid(annotation_id, "annotation_id")

        # Verify ownership of document
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Verify the annotation
        annotation = db.verify_annotation(annotation_id, admin_token)
        if annotation is None:
            raise HTTPException(
                status_code=404,
                detail="Annotation not found",
            )

        return AnnotationVerifyResponse(
            annotation_id=annotation_id,
            is_verified=annotation.is_verified,
            verified_at=annotation.verified_at,
            verified_by=annotation.verified_by,
            message="Annotation verified successfully",
        )

    @router.patch(
        "/{document_id}/annotations/{annotation_id}/override",
        response_model=AnnotationOverrideResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Annotation not found"},
        },
        summary="Override annotation",
        description="Override an auto-generated annotation with manual corrections.",
    )
    async def override_annotation(
        document_id: str,
        annotation_id: str,
        request: AnnotationOverrideRequest,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> AnnotationOverrideResponse:
        """Override an auto-generated annotation."""
        _validate_uuid(document_id, "document_id")
        _validate_uuid(annotation_id, "annotation_id")

        # Verify ownership of document
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Build updates dict from request
        updates = {}
        if request.text_value is not None:
            updates["text_value"] = request.text_value
        if request.class_id is not None:
            updates["class_id"] = request.class_id
            # Update class_name if class_id changed
|
||||||
|
if request.class_id in FIELD_CLASSES:
|
||||||
|
updates["class_name"] = FIELD_CLASSES[request.class_id]
|
||||||
|
if request.class_name is not None:
|
||||||
|
updates["class_name"] = request.class_name
|
||||||
|
if request.bbox:
|
||||||
|
# Update bbox fields
|
||||||
|
if "x" in request.bbox:
|
||||||
|
updates["bbox_x"] = request.bbox["x"]
|
||||||
|
if "y" in request.bbox:
|
||||||
|
updates["bbox_y"] = request.bbox["y"]
|
||||||
|
if "width" in request.bbox:
|
||||||
|
updates["bbox_width"] = request.bbox["width"]
|
||||||
|
if "height" in request.bbox:
|
||||||
|
updates["bbox_height"] = request.bbox["height"]
|
||||||
|
|
||||||
|
if not updates:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail="No updates provided. Specify at least one field to update.",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Override the annotation
|
||||||
|
annotation = db.override_annotation(
|
||||||
|
annotation_id=annotation_id,
|
||||||
|
admin_token=admin_token,
|
||||||
|
change_reason=request.reason,
|
||||||
|
**updates,
|
||||||
|
)
|
||||||
|
|
||||||
|
if annotation is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Annotation not found",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get history to return history_id
|
||||||
|
history_records = db.get_annotation_history(UUID(annotation_id))
|
||||||
|
latest_history = history_records[0] if history_records else None
|
||||||
|
|
||||||
|
return AnnotationOverrideResponse(
|
||||||
|
annotation_id=annotation_id,
|
||||||
|
source=annotation.source,
|
||||||
|
override_source=annotation.override_source,
|
||||||
|
original_annotation_id=str(annotation.original_annotation_id) if annotation.original_annotation_id else None,
|
||||||
|
message="Annotation overridden successfully",
|
||||||
|
history_id=str(latest_history.history_id) if latest_history else "",
|
||||||
|
)
|
||||||
|
|
||||||
|
return router
|
||||||
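A minimal client sketch for the verify/override endpoints above. It assumes the router is mounted under /api/v1, a bearer-token header (the concrete scheme consumed by AdminTokenDep may differ), a local server, and httpx as the client; DOC_ID and ANN_ID stand in for real UUIDs.

import httpx

BASE = "http://localhost:8000/api/v1/admin/documents"
HEADERS = {"Authorization": "Bearer ADMIN_TOKEN"}  # auth scheme assumed

# Mark an annotation as reviewed by a human.
r = httpx.post(f"{BASE}/DOC_ID/annotations/ANN_ID/verify", headers=HEADERS)
print(r.json()["message"])

# Correct an auto-generated annotation; only the supplied fields change.
r = httpx.patch(
    f"{BASE}/DOC_ID/annotations/ANN_ID/override",
    headers=HEADERS,
    json={"text_value": "12345", "reason": "OCR misread the invoice number"},
)
print(r.json()["override_source"])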
82   src/web/api/v1/admin/auth.py   Normal file
@@ -0,0 +1,82 @@
"""
Admin Auth Routes

FastAPI endpoints for admin token management.
"""

import logging
import secrets
from datetime import datetime, timedelta

from fastapi import APIRouter

from src.web.core.auth import AdminTokenDep, AdminDBDep
from src.web.schemas.admin import (
    AdminTokenCreate,
    AdminTokenResponse,
)
from src.web.schemas.common import ErrorResponse

logger = logging.getLogger(__name__)


def create_auth_router() -> APIRouter:
    """Create admin auth router."""
    router = APIRouter(prefix="/admin/auth", tags=["Admin Auth"])

    @router.post(
        "/token",
        response_model=AdminTokenResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid request"},
        },
        summary="Create admin token",
        description="Create a new admin authentication token.",
    )
    async def create_token(
        request: AdminTokenCreate,
        db: AdminDBDep,
    ) -> AdminTokenResponse:
        """Create a new admin token."""
        # Generate secure token
        token = secrets.token_urlsafe(32)

        # Calculate expiration
        expires_at = None
        if request.expires_in_days:
            expires_at = datetime.utcnow() + timedelta(days=request.expires_in_days)

        # Create token in database
        db.create_admin_token(
            token=token,
            name=request.name,
            expires_at=expires_at,
        )

        return AdminTokenResponse(
            token=token,
            name=request.name,
            expires_at=expires_at,
            message="Admin token created successfully",
        )

    @router.delete(
        "/token",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Revoke admin token",
        description="Revoke the current admin token.",
    )
    async def revoke_token(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> dict:
        """Revoke the current admin token."""
        db.deactivate_admin_token(admin_token)
        return {
            "status": "revoked",
            "message": "Admin token has been revoked",
        }

    return router
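A sketch of the token lifecycle defined above, under the same client assumptions (local server, /api/v1 mount, httpx). Note that creation takes no credentials in this router, while revocation authenticates with the token being revoked; the bearer header is an assumption about AdminTokenDep.

import httpx

BASE = "http://localhost:8000/api/v1/admin/auth"

# Create a token that expires in 30 days (expires_in_days is optional).
created = httpx.post(
    f"{BASE}/token",
    json={"name": "ci-bot", "expires_in_days": 30},
).json()
token = created["token"]

# Revoke it again.
httpx.delete(f"{BASE}/token", headers={"Authorization": f"Bearer {token}"})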
551  src/web/api/v1/admin/documents.py   Normal file
@@ -0,0 +1,551 @@
"""
Admin Document Routes

FastAPI endpoints for admin document management.
"""

import logging
from pathlib import Path
from typing import Annotated
from uuid import UUID

from fastapi import APIRouter, File, HTTPException, Query, UploadFile

from src.web.config import DEFAULT_DPI, StorageConfig
from src.web.core.auth import AdminTokenDep, AdminDBDep
from src.web.schemas.admin import (
    AnnotationItem,
    AnnotationSource,
    AutoLabelStatus,
    BoundingBox,
    DocumentDetailResponse,
    DocumentItem,
    DocumentListResponse,
    DocumentStatus,
    DocumentStatsResponse,
    DocumentUploadResponse,
    ModelMetrics,
    TrainingHistoryItem,
)
from src.web.schemas.common import ErrorResponse

logger = logging.getLogger(__name__)


def _validate_uuid(value: str, name: str = "ID") -> None:
    """Validate UUID format."""
    try:
        UUID(value)
    except ValueError:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {name} format. Must be a valid UUID.",
        )


def _convert_pdf_to_images(
    document_id: str, content: bytes, page_count: int, images_dir: Path, dpi: int
) -> None:
    """Convert PDF pages to images for annotation."""
    import fitz

    doc_images_dir = images_dir / document_id
    doc_images_dir.mkdir(parents=True, exist_ok=True)

    pdf_doc = fitz.open(stream=content, filetype="pdf")

    for page_num in range(page_count):
        page = pdf_doc[page_num]
        # Render at configured DPI for consistency with training
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat)

        image_path = doc_images_dir / f"page_{page_num + 1}.png"
        pix.save(str(image_path))

    pdf_doc.close()

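# Worked example of the rendering math above (assuming the project-wide
# 150 DPI default): PyMuPDF's base resolution is 72 DPI, so the zoom factor
# is 150 / 72, roughly 2.083, and an A4 page (595 x 842 pt) renders to about
# 1240 x 1754 px, the same geometry the training pipeline sees.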
def create_documents_router(storage_config: StorageConfig) -> APIRouter:
    """Create admin documents router."""
    router = APIRouter(prefix="/admin/documents", tags=["Admin Documents"])

    # Directories are created by StorageConfig.__post_init__
    allowed_extensions = storage_config.allowed_extensions

    @router.post(
        "",
        response_model=DocumentUploadResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid file"},
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Upload document",
        description="Upload a PDF or image document for labeling.",
    )
    async def upload_document(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        file: UploadFile = File(..., description="PDF or image file"),
        auto_label: Annotated[
            bool,
            Query(description="Trigger auto-labeling after upload"),
        ] = True,
    ) -> DocumentUploadResponse:
        """Upload a document for labeling."""
        # Validate filename
        if not file.filename:
            raise HTTPException(status_code=400, detail="Filename is required")

        # Validate extension
        file_ext = Path(file.filename).suffix.lower()
        if file_ext not in allowed_extensions:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {file_ext}. "
                f"Allowed: {', '.join(allowed_extensions)}",
            )

        # Read file content
        try:
            content = await file.read()
        except Exception as e:
            logger.error(f"Failed to read uploaded file: {e}")
            raise HTTPException(status_code=400, detail="Failed to read file")

        # Get page count (for PDF)
        page_count = 1
        if file_ext == ".pdf":
            try:
                import fitz
                pdf_doc = fitz.open(stream=content, filetype="pdf")
                page_count = len(pdf_doc)
                pdf_doc.close()
            except Exception as e:
                logger.warning(f"Failed to get PDF page count: {e}")

        # Create document record (token only used for auth, not stored)
        document_id = db.create_document(
            filename=file.filename,
            file_size=len(content),
            content_type=file.content_type or "application/octet-stream",
            file_path="",  # Will update after saving
            page_count=page_count,
        )

        # Save file to admin uploads
        file_path = storage_config.admin_upload_dir / f"{document_id}{file_ext}"
        try:
            file_path.write_bytes(content)
        except Exception as e:
            logger.error(f"Failed to save file: {e}")
            raise HTTPException(status_code=500, detail="Failed to save file")

        # Update file path in database
        from src.data.database import get_session_context
        from src.data.admin_models import AdminDocument
        with get_session_context() as session:
            doc = session.get(AdminDocument, UUID(document_id))
            if doc:
                doc.file_path = str(file_path)
                session.add(doc)

        # Convert PDF to images for annotation
        if file_ext == ".pdf":
            try:
                _convert_pdf_to_images(
                    document_id, content, page_count,
                    storage_config.admin_images_dir, storage_config.dpi
                )
            except Exception as e:
                logger.error(f"Failed to convert PDF to images: {e}")

        # Trigger auto-labeling if requested
        auto_label_started = False
        if auto_label:
            # Auto-labeling will be triggered by a background task
            db.update_document_status(
                document_id=document_id,
                status="auto_labeling",
                auto_label_status="running",
            )
            auto_label_started = True

        return DocumentUploadResponse(
            document_id=document_id,
            filename=file.filename,
            file_size=len(content),
            page_count=page_count,
            status=DocumentStatus.AUTO_LABELING if auto_label_started else DocumentStatus.PENDING,
            auto_label_started=auto_label_started,
            message="Document uploaded successfully",
        )

    @router.get(
        "",
        response_model=DocumentListResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="List documents",
        description="List all documents for the current admin.",
    )
    async def list_documents(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        status: Annotated[
            str | None,
            Query(description="Filter by status"),
        ] = None,
        upload_source: Annotated[
            str | None,
            Query(description="Filter by upload source (ui or api)"),
        ] = None,
        has_annotations: Annotated[
            bool | None,
            Query(description="Filter by annotation presence"),
        ] = None,
        auto_label_status: Annotated[
            str | None,
            Query(description="Filter by auto-label status"),
        ] = None,
        batch_id: Annotated[
            str | None,
            Query(description="Filter by batch ID"),
        ] = None,
        limit: Annotated[
            int,
            Query(ge=1, le=100, description="Page size"),
        ] = 20,
        offset: Annotated[
            int,
            Query(ge=0, description="Offset"),
        ] = 0,
    ) -> DocumentListResponse:
        """List documents."""
        # Validate status
        if status and status not in ("pending", "auto_labeling", "labeled", "exported"):
            raise HTTPException(
                status_code=400,
                detail=f"Invalid status: {status}",
            )

        # Validate upload_source
        if upload_source and upload_source not in ("ui", "api"):
            raise HTTPException(
                status_code=400,
                detail=f"Invalid upload_source: {upload_source}",
            )

        # Validate auto_label_status
        if auto_label_status and auto_label_status not in ("pending", "running", "completed", "failed"):
            raise HTTPException(
                status_code=400,
                detail=f"Invalid auto_label_status: {auto_label_status}",
            )

        documents, total = db.get_documents_by_token(
            admin_token=admin_token,
            status=status,
            upload_source=upload_source,
            has_annotations=has_annotations,
            auto_label_status=auto_label_status,
            batch_id=batch_id,
            limit=limit,
            offset=offset,
        )

        # Get annotation counts and build items
        items = []
        for doc in documents:
            annotations = db.get_annotations_for_document(str(doc.document_id))

            # Determine if document can be annotated (not locked)
            can_annotate = True
            if hasattr(doc, 'annotation_lock_until') and doc.annotation_lock_until:
                from datetime import datetime, timezone
                can_annotate = doc.annotation_lock_until < datetime.now(timezone.utc)

            items.append(
                DocumentItem(
                    document_id=str(doc.document_id),
                    filename=doc.filename,
                    file_size=doc.file_size,
                    page_count=doc.page_count,
                    status=DocumentStatus(doc.status),
                    auto_label_status=AutoLabelStatus(doc.auto_label_status) if doc.auto_label_status else None,
                    annotation_count=len(annotations),
                    upload_source=doc.upload_source if hasattr(doc, 'upload_source') else "ui",
                    batch_id=str(doc.batch_id) if hasattr(doc, 'batch_id') and doc.batch_id else None,
                    can_annotate=can_annotate,
                    created_at=doc.created_at,
                    updated_at=doc.updated_at,
                )
            )

        return DocumentListResponse(
            total=total,
            limit=limit,
            offset=offset,
            documents=items,
        )

    @router.get(
        "/stats",
        response_model=DocumentStatsResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Get document statistics",
        description="Get document count by status.",
    )
    async def get_document_stats(
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> DocumentStatsResponse:
        """Get document statistics."""
        counts = db.count_documents_by_status(admin_token)

        return DocumentStatsResponse(
            total=sum(counts.values()),
            pending=counts.get("pending", 0),
            auto_labeling=counts.get("auto_labeling", 0),
            labeled=counts.get("labeled", 0),
            exported=counts.get("exported", 0),
        )

    @router.get(
        "/{document_id}",
        response_model=DocumentDetailResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Get document detail",
        description="Get document details with annotations.",
    )
    async def get_document(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> DocumentDetailResponse:
        """Get document details."""
        _validate_uuid(document_id, "document_id")

        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Get annotations
        raw_annotations = db.get_annotations_for_document(document_id)
        annotations = [
            AnnotationItem(
                annotation_id=str(ann.annotation_id),
                page_number=ann.page_number,
                class_id=ann.class_id,
                class_name=ann.class_name,
                bbox=BoundingBox(
                    x=ann.bbox_x,
                    y=ann.bbox_y,
                    width=ann.bbox_width,
                    height=ann.bbox_height,
                ),
                normalized_bbox={
                    "x_center": ann.x_center,
                    "y_center": ann.y_center,
                    "width": ann.width,
                    "height": ann.height,
                },
                text_value=ann.text_value,
                confidence=ann.confidence,
                source=AnnotationSource(ann.source),
                created_at=ann.created_at,
            )
            for ann in raw_annotations
        ]

        # Generate image URLs
        image_urls = []
        for page in range(1, document.page_count + 1):
            image_urls.append(f"/api/v1/admin/documents/{document_id}/images/{page}")

        # Determine if document can be annotated (not locked)
        can_annotate = True
        annotation_lock_until = None
        if hasattr(document, 'annotation_lock_until') and document.annotation_lock_until:
            from datetime import datetime, timezone
            annotation_lock_until = document.annotation_lock_until
            can_annotate = document.annotation_lock_until < datetime.now(timezone.utc)

        # Get CSV field values if available
        csv_field_values = None
        if hasattr(document, 'csv_field_values') and document.csv_field_values:
            csv_field_values = document.csv_field_values

        # Get training history (Phase 5)
        training_history = []
        training_links = db.get_document_training_tasks(document.document_id)
        for link in training_links:
            # Get task details
            task = db.get_training_task(str(link.task_id))
            if task:
                # Build metrics
                metrics = None
                if task.metrics_mAP or task.metrics_precision or task.metrics_recall:
                    metrics = ModelMetrics(
                        mAP=task.metrics_mAP,
                        precision=task.metrics_precision,
                        recall=task.metrics_recall,
                    )

                training_history.append(
                    TrainingHistoryItem(
                        task_id=str(link.task_id),
                        name=task.name,
                        trained_at=link.created_at,
                        model_metrics=metrics,
                    )
                )

        return DocumentDetailResponse(
            document_id=str(document.document_id),
            filename=document.filename,
            file_size=document.file_size,
            content_type=document.content_type,
            page_count=document.page_count,
            status=DocumentStatus(document.status),
            auto_label_status=AutoLabelStatus(document.auto_label_status) if document.auto_label_status else None,
            auto_label_error=document.auto_label_error,
            upload_source=document.upload_source if hasattr(document, 'upload_source') else "ui",
            batch_id=str(document.batch_id) if hasattr(document, 'batch_id') and document.batch_id else None,
            csv_field_values=csv_field_values,
            can_annotate=can_annotate,
            annotation_lock_until=annotation_lock_until,
            annotations=annotations,
            image_urls=image_urls,
            training_history=training_history,
            created_at=document.created_at,
            updated_at=document.updated_at,
        )

    @router.delete(
        "/{document_id}",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Delete document",
        description="Delete a document and its annotations.",
    )
    async def delete_document(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> dict:
        """Delete a document."""
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Delete file
        file_path = Path(document.file_path)
        if file_path.exists():
            file_path.unlink()
        # Delete images
        images_dir = storage_config.admin_images_dir / document_id
        if images_dir.exists():
            import shutil
            shutil.rmtree(images_dir)
        # Delete from database
        db.delete_document(document_id)

        return {
            "status": "deleted",
            "document_id": document_id,
            "message": "Document deleted successfully",
        }

    @router.patch(
        "/{document_id}/status",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Update document status",
        description="Update document status (e.g., mark as labeled). When marking as 'labeled', annotations are saved to PostgreSQL.",
    )
    async def update_document_status(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        status: Annotated[
            str,
            Query(description="New status"),
        ],
    ) -> dict:
        """Update document status.

        When status is set to 'labeled', the annotations are automatically
        saved to the PostgreSQL documents/field_results tables for consistency
        with the CLI auto-label workflow.
        """
        _validate_uuid(document_id, "document_id")

        # Validate status
        if status not in ("pending", "labeled", "exported"):
            raise HTTPException(
                status_code=400,
                detail=f"Invalid status: {status}",
            )

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # If marking as labeled, save annotations to PostgreSQL DocumentDB
        db_save_result = None
        if status == "labeled":
            from src.web.services.db_autolabel import save_manual_annotations_to_document_db

            # Get all annotations for this document
            annotations = db.get_annotations_for_document(document_id)

            if annotations:
                db_save_result = save_manual_annotations_to_document_db(
                    document=document,
                    annotations=annotations,
                    db=db,
                )

        db.update_document_status(document_id, status)

        response = {
            "status": "updated",
            "document_id": document_id,
            "new_status": status,
            "message": "Document status updated",
        }

        # Include PostgreSQL save result if applicable
        if db_save_result:
            response["document_db_saved"] = db_save_result.get("success", False)
            response["fields_saved"] = db_save_result.get("fields_saved", 0)

        return response

    return router
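An end-to-end usage sketch for the document endpoints above: upload a PDF, then poll the detail view until auto-labeling settles. The local base URL, bearer header, and httpx client are assumptions; the response field names come from the response models used in this file.

import time

import httpx

BASE = "http://localhost:8000/api/v1/admin/documents"
HEADERS = {"Authorization": "Bearer ADMIN_TOKEN"}  # auth scheme assumed

with open("invoice.pdf", "rb") as fh:
    up = httpx.post(
        BASE,
        headers=HEADERS,
        files={"file": ("invoice.pdf", fh, "application/pdf")},
    ).json()

doc_id = up["document_id"]
while True:
    detail = httpx.get(f"{BASE}/{doc_id}", headers=HEADERS).json()
    if detail["auto_label_status"] in (None, "completed", "failed"):
        break
    time.sleep(2)

print(detail["status"], len(detail["annotations"]))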
184  src/web/api/v1/admin/locks.py   Normal file
@@ -0,0 +1,184 @@
"""
Admin Document Lock Routes

FastAPI endpoints for annotation lock management.
"""

import logging
from typing import Annotated
from uuid import UUID

from fastapi import APIRouter, HTTPException, Query

from src.web.core.auth import AdminTokenDep, AdminDBDep
from src.web.schemas.admin import (
    AnnotationLockRequest,
    AnnotationLockResponse,
)
from src.web.schemas.common import ErrorResponse

logger = logging.getLogger(__name__)


def _validate_uuid(value: str, name: str = "ID") -> None:
    """Validate UUID format."""
    try:
        UUID(value)
    except ValueError:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {name} format. Must be a valid UUID.",
        )


def create_locks_router() -> APIRouter:
    """Create annotation locks router."""
    router = APIRouter(prefix="/admin/documents", tags=["Admin Locks"])

    @router.post(
        "/{document_id}/lock",
        response_model=AnnotationLockResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
            409: {"model": ErrorResponse, "description": "Document already locked"},
        },
        summary="Acquire annotation lock",
        description="Acquire a lock on a document to prevent concurrent annotation edits.",
    )
    async def acquire_lock(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        request: AnnotationLockRequest = AnnotationLockRequest(),
    ) -> AnnotationLockResponse:
        """Acquire annotation lock for a document."""
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Attempt to acquire lock
        updated_doc = db.acquire_annotation_lock(
            document_id=document_id,
            admin_token=admin_token,
            duration_seconds=request.duration_seconds,
        )

        if updated_doc is None:
            raise HTTPException(
                status_code=409,
                detail="Document is already locked. Please try again later.",
            )

        return AnnotationLockResponse(
            document_id=document_id,
            locked=True,
            lock_expires_at=updated_doc.annotation_lock_until,
            message=f"Lock acquired for {request.duration_seconds} seconds",
        )

    @router.delete(
        "/{document_id}/lock",
        response_model=AnnotationLockResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
        },
        summary="Release annotation lock",
        description="Release the annotation lock on a document.",
    )
    async def release_lock(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        force: Annotated[
            bool,
            Query(description="Force release (admin override)"),
        ] = False,
    ) -> AnnotationLockResponse:
        """Release annotation lock for a document."""
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Release lock
        updated_doc = db.release_annotation_lock(
            document_id=document_id,
            admin_token=admin_token,
            force=force,
        )

        if updated_doc is None:
            raise HTTPException(
                status_code=404,
                detail="Failed to release lock",
            )

        return AnnotationLockResponse(
            document_id=document_id,
            locked=False,
            lock_expires_at=None,
            message="Lock released successfully",
        )

    @router.patch(
        "/{document_id}/lock",
        response_model=AnnotationLockResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid token"},
            404: {"model": ErrorResponse, "description": "Document not found"},
            409: {"model": ErrorResponse, "description": "Lock expired or doesn't exist"},
        },
        summary="Extend annotation lock",
        description="Extend an existing annotation lock.",
    )
    async def extend_lock(
        document_id: str,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
        request: AnnotationLockRequest = AnnotationLockRequest(),
    ) -> AnnotationLockResponse:
        """Extend annotation lock for a document."""
        _validate_uuid(document_id, "document_id")

        # Verify ownership
        document = db.get_document_by_token(document_id, admin_token)
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found or does not belong to this token",
            )

        # Attempt to extend lock
        updated_doc = db.extend_annotation_lock(
            document_id=document_id,
            admin_token=admin_token,
            additional_seconds=request.duration_seconds,
        )

        if updated_doc is None:
            raise HTTPException(
                status_code=409,
                detail="Lock doesn't exist or has expired. Please acquire a new lock.",
            )

        return AnnotationLockResponse(
            document_id=document_id,
            locked=True,
            lock_expires_at=updated_doc.annotation_lock_until,
            message=f"Lock extended by {request.duration_seconds} seconds",
        )

    return router
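A sketch of the intended acquire/extend/release flow, under the same client assumptions as the earlier sketches; duration_seconds is the field the router reads from AnnotationLockRequest, while the 300-second value and the JSON body shape are illustrative.

import httpx

BASE = "http://localhost:8000/api/v1/admin/documents"
HEADERS = {"Authorization": "Bearer ADMIN_TOKEN"}  # auth scheme assumed

# Take the lock before opening the annotation editor.
lock = httpx.post(
    f"{BASE}/DOC_ID/lock", headers=HEADERS, json={"duration_seconds": 300}
).json()
print(lock["lock_expires_at"])

# Heartbeat while the editor stays open, then release on close.
httpx.patch(f"{BASE}/DOC_ID/lock", headers=HEADERS, json={"duration_seconds": 300})
httpx.delete(f"{BASE}/DOC_ID/lock", headers=HEADERS)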
622  src/web/api/v1/admin/training.py   Normal file
@@ -0,0 +1,622 @@
|
|||||||
|
"""
|
||||||
|
Admin Training API Routes
|
||||||
|
|
||||||
|
FastAPI endpoints for training task management and scheduling.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Annotated, Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, Query
|
||||||
|
|
||||||
|
from src.data.admin_db import AdminDB
|
||||||
|
from src.web.core.auth import AdminTokenDep, AdminDBDep
|
||||||
|
from src.web.schemas.admin import (
|
||||||
|
ExportRequest,
|
||||||
|
ExportResponse,
|
||||||
|
ModelMetrics,
|
||||||
|
TrainingConfig,
|
||||||
|
TrainingDocumentItem,
|
||||||
|
TrainingDocumentsResponse,
|
||||||
|
TrainingHistoryItem,
|
||||||
|
TrainingLogItem,
|
||||||
|
TrainingLogsResponse,
|
||||||
|
TrainingModelItem,
|
||||||
|
TrainingModelsResponse,
|
||||||
|
TrainingStatus,
|
||||||
|
TrainingTaskCreate,
|
||||||
|
TrainingTaskDetailResponse,
|
||||||
|
TrainingTaskItem,
|
||||||
|
TrainingTaskListResponse,
|
||||||
|
TrainingTaskResponse,
|
||||||
|
TrainingType,
|
||||||
|
)
|
||||||
|
from src.web.schemas.common import ErrorResponse
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_uuid(value: str, name: str = "ID") -> None:
|
||||||
|
"""Validate UUID format."""
|
||||||
|
try:
|
||||||
|
UUID(value)
|
||||||
|
except ValueError:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Invalid {name} format. Must be a valid UUID.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def create_training_router() -> APIRouter:
|
||||||
|
"""Create training API router."""
|
||||||
|
router = APIRouter(prefix="/admin/training", tags=["Admin Training"])
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Training Task Endpoints
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/tasks",
|
||||||
|
response_model=TrainingTaskResponse,
|
||||||
|
responses={
|
||||||
|
400: {"model": ErrorResponse, "description": "Invalid request"},
|
||||||
|
401: {"model": ErrorResponse, "description": "Invalid token"},
|
||||||
|
},
|
||||||
|
summary="Create training task",
|
||||||
|
description="Create a new training task.",
|
||||||
|
)
|
||||||
|
async def create_training_task(
|
||||||
|
request: TrainingTaskCreate,
|
||||||
|
admin_token: AdminTokenDep,
|
||||||
|
db: AdminDBDep,
|
||||||
|
) -> TrainingTaskResponse:
|
||||||
|
"""Create a new training task."""
|
||||||
|
# Convert config to dict
|
||||||
|
config_dict = request.config.model_dump() if request.config else {}
|
||||||
|
|
||||||
|
# Create task
|
||||||
|
task_id = db.create_training_task(
|
||||||
|
admin_token=admin_token,
|
||||||
|
name=request.name,
|
||||||
|
task_type=request.task_type.value,
|
||||||
|
description=request.description,
|
||||||
|
config=config_dict,
|
||||||
|
scheduled_at=request.scheduled_at,
|
||||||
|
cron_expression=request.cron_expression,
|
||||||
|
is_recurring=bool(request.cron_expression),
|
||||||
|
)
|
||||||
|
|
||||||
|
return TrainingTaskResponse(
|
||||||
|
task_id=task_id,
|
||||||
|
status=TrainingStatus.SCHEDULED if request.scheduled_at else TrainingStatus.PENDING,
|
||||||
|
message="Training task created successfully",
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/tasks",
|
||||||
|
response_model=TrainingTaskListResponse,
|
||||||
|
responses={
|
||||||
|
401: {"model": ErrorResponse, "description": "Invalid token"},
|
||||||
|
},
|
||||||
|
summary="List training tasks",
|
||||||
|
description="List all training tasks.",
|
||||||
|
)
|
||||||
|
async def list_training_tasks(
|
||||||
|
admin_token: AdminTokenDep,
|
||||||
|
db: AdminDBDep,
|
||||||
|
status: Annotated[
|
||||||
|
str | None,
|
||||||
|
Query(description="Filter by status"),
|
||||||
|
] = None,
|
||||||
|
limit: Annotated[
|
||||||
|
int,
|
||||||
|
Query(ge=1, le=100, description="Page size"),
|
||||||
|
] = 20,
|
||||||
|
offset: Annotated[
|
||||||
|
int,
|
||||||
|
Query(ge=0, description="Offset"),
|
||||||
|
] = 0,
|
||||||
|
) -> TrainingTaskListResponse:
|
||||||
|
"""List training tasks."""
|
||||||
|
# Validate status
|
||||||
|
valid_statuses = ("pending", "scheduled", "running", "completed", "failed", "cancelled")
|
||||||
|
if status and status not in valid_statuses:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Invalid status: {status}. Must be one of: {', '.join(valid_statuses)}",
|
||||||
|
)
|
||||||
|
|
||||||
|
tasks, total = db.get_training_tasks_by_token(
|
||||||
|
admin_token=admin_token,
|
||||||
|
status=status,
|
||||||
|
limit=limit,
|
||||||
|
offset=offset,
|
||||||
|
)
|
||||||
|
|
||||||
|
items = [
|
||||||
|
TrainingTaskItem(
|
||||||
|
task_id=str(task.task_id),
|
||||||
|
name=task.name,
|
||||||
|
task_type=TrainingType(task.task_type),
|
||||||
|
status=TrainingStatus(task.status),
|
||||||
|
scheduled_at=task.scheduled_at,
|
||||||
|
is_recurring=task.is_recurring,
|
||||||
|
started_at=task.started_at,
|
||||||
|
completed_at=task.completed_at,
|
||||||
|
created_at=task.created_at,
|
||||||
|
)
|
||||||
|
for task in tasks
|
||||||
|
]
|
||||||
|
|
||||||
|
return TrainingTaskListResponse(
|
||||||
|
total=total,
|
||||||
|
limit=limit,
|
||||||
|
offset=offset,
|
||||||
|
tasks=items,
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/tasks/{task_id}",
|
||||||
|
response_model=TrainingTaskDetailResponse,
|
||||||
|
responses={
|
||||||
|
401: {"model": ErrorResponse, "description": "Invalid token"},
|
||||||
|
404: {"model": ErrorResponse, "description": "Task not found"},
|
||||||
|
},
|
||||||
|
summary="Get training task detail",
|
||||||
|
description="Get training task details.",
|
||||||
|
)
|
||||||
|
async def get_training_task(
|
||||||
|
task_id: str,
|
||||||
|
admin_token: AdminTokenDep,
|
||||||
|
db: AdminDBDep,
|
||||||
|
) -> TrainingTaskDetailResponse:
|
||||||
|
"""Get training task details."""
|
||||||
|
_validate_uuid(task_id, "task_id")
|
||||||
|
|
||||||
|
task = db.get_training_task_by_token(task_id, admin_token)
|
||||||
|
if task is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Training task not found or does not belong to this token",
|
||||||
|
)
|
||||||
|
|
||||||
|
return TrainingTaskDetailResponse(
|
||||||
|
task_id=str(task.task_id),
|
||||||
|
name=task.name,
|
||||||
|
description=task.description,
|
||||||
|
task_type=TrainingType(task.task_type),
|
||||||
|
status=TrainingStatus(task.status),
|
||||||
|
config=task.config,
|
||||||
|
scheduled_at=task.scheduled_at,
|
||||||
|
cron_expression=task.cron_expression,
|
||||||
|
is_recurring=task.is_recurring,
|
||||||
|
started_at=task.started_at,
|
||||||
|
completed_at=task.completed_at,
|
||||||
|
error_message=task.error_message,
|
||||||
|
result_metrics=task.result_metrics,
|
||||||
|
model_path=task.model_path,
|
||||||
|
created_at=task.created_at,
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.post(
|
||||||
|
"/tasks/{task_id}/cancel",
|
||||||
|
response_model=TrainingTaskResponse,
|
||||||
|
responses={
|
||||||
|
401: {"model": ErrorResponse, "description": "Invalid token"},
|
||||||
|
404: {"model": ErrorResponse, "description": "Task not found"},
|
||||||
|
409: {"model": ErrorResponse, "description": "Cannot cancel task"},
|
||||||
|
},
|
||||||
|
summary="Cancel training task",
|
||||||
|
description="Cancel a pending or scheduled training task.",
|
||||||
|
)
|
||||||
|
async def cancel_training_task(
|
||||||
|
task_id: str,
|
||||||
|
admin_token: AdminTokenDep,
|
||||||
|
db: AdminDBDep,
|
||||||
|
) -> TrainingTaskResponse:
|
||||||
|
"""Cancel a training task."""
|
||||||
|
_validate_uuid(task_id, "task_id")
|
||||||
|
|
||||||
|
# Verify ownership
|
||||||
|
task = db.get_training_task_by_token(task_id, admin_token)
|
||||||
|
if task is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Training task not found or does not belong to this token",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if can be cancelled
|
||||||
|
if task.status not in ("pending", "scheduled"):
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=409,
|
||||||
|
detail=f"Cannot cancel task with status: {task.status}",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Cancel task
|
||||||
|
success = db.cancel_training_task(task_id)
|
||||||
|
if not success:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail="Failed to cancel training task",
|
||||||
|
)
|
||||||
|
|
||||||
|
return TrainingTaskResponse(
|
||||||
|
task_id=task_id,
|
||||||
|
status=TrainingStatus.CANCELLED,
|
||||||
|
message="Training task cancelled successfully",
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/tasks/{task_id}/logs",
|
||||||
|
response_model=TrainingLogsResponse,
|
||||||
|
responses={
|
||||||
|
401: {"model": ErrorResponse, "description": "Invalid token"},
|
||||||
|
404: {"model": ErrorResponse, "description": "Task not found"},
|
||||||
|
},
|
||||||
|
summary="Get training logs",
|
||||||
|
description="Get training task logs.",
|
||||||
|
)
|
||||||
|
async def get_training_logs(
|
||||||
|
task_id: str,
|
||||||
|
admin_token: AdminTokenDep,
|
||||||
|
db: AdminDBDep,
|
||||||
|
limit: Annotated[
|
||||||
|
int,
|
||||||
|
Query(ge=1, le=500, description="Maximum logs to return"),
|
||||||
|
] = 100,
|
||||||
|
offset: Annotated[
|
||||||
|
int,
|
||||||
|
Query(ge=0, description="Offset"),
|
||||||
|
] = 0,
|
||||||
|
) -> TrainingLogsResponse:
|
||||||
|
"""Get training logs."""
|
||||||
|
_validate_uuid(task_id, "task_id")
|
||||||
|
|
||||||
|
# Verify ownership
|
||||||
|
task = db.get_training_task_by_token(task_id, admin_token)
|
||||||
|
if task is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Training task not found or does not belong to this token",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get logs
|
||||||
|
logs = db.get_training_logs(task_id, limit, offset)
|
||||||
|
|
||||||
|
items = [
|
||||||
|
TrainingLogItem(
|
||||||
|
level=log.level,
|
||||||
|
message=log.message,
|
||||||
|
details=log.details,
|
||||||
|
created_at=log.created_at,
|
||||||
|
)
|
||||||
|
for log in logs
|
||||||
|
]
|
||||||
|
|
||||||
|
return TrainingLogsResponse(
|
||||||
|
task_id=task_id,
|
||||||
|
logs=items,
|
||||||
|
)
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Phase 4: Training Data Management
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/documents",
|
||||||
|
response_model=TrainingDocumentsResponse,
|
||||||
|
responses={
|
||||||
|
401: {"model": ErrorResponse, "description": "Invalid token"},
|
||||||
|
},
|
||||||
|
summary="Get documents for training",
|
||||||
|
description="Get labeled documents available for training with filtering options.",
|
||||||
|
)
|
||||||
|
async def get_training_documents(
|
||||||
|
admin_token: AdminTokenDep,
|
||||||
|
db: AdminDBDep,
|
||||||
|
has_annotations: Annotated[
|
||||||
|
bool,
|
||||||
|
Query(description="Only include documents with annotations"),
|
||||||
|
] = True,
|
||||||
|
min_annotation_count: Annotated[
|
||||||
|
int | None,
|
||||||
|
Query(ge=1, description="Minimum annotation count"),
|
||||||
|
] = None,
|
||||||
|
exclude_used_in_training: Annotated[
|
||||||
|
bool,
|
||||||
|
Query(description="Exclude documents already used in training"),
|
||||||
|
] = False,
|
||||||
|
limit: Annotated[
|
||||||
|
int,
|
||||||
|
Query(ge=1, le=100, description="Page size"),
|
||||||
|
] = 100,
|
||||||
|
offset: Annotated[
|
||||||
|
int,
|
||||||
|
Query(ge=0, description="Offset"),
|
||||||
|
] = 0,
|
||||||
|
) -> TrainingDocumentsResponse:
|
||||||
|
"""Get documents available for training."""
|
||||||
|
# Get documents
|
||||||
|
documents, total = db.get_documents_for_training(
|
||||||
|
admin_token=admin_token,
|
||||||
|
status="labeled",
|
||||||
|
has_annotations=has_annotations,
|
||||||
|
min_annotation_count=min_annotation_count,
|
||||||
|
exclude_used_in_training=exclude_used_in_training,
|
||||||
|
limit=limit,
|
||||||
|
offset=offset,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build response items with annotation details and training history
|
||||||
|
items = []
|
||||||
|
for doc in documents:
|
||||||
|
# Get annotations for this document
|
||||||
|
annotations = db.get_annotations_for_document(str(doc.document_id))
|
||||||
|
|
||||||
|
# Count annotations by source
|
||||||
|
sources = {"manual": 0, "auto": 0}
|
||||||
|
for ann in annotations:
|
||||||
|
if ann.source in sources:
|
||||||
|
sources[ann.source] += 1
|
||||||
|
|
||||||
|
# Get training history
|
||||||
|
training_links = db.get_document_training_tasks(doc.document_id)
|
||||||
|
used_in_training = [str(link.task_id) for link in training_links]
|
||||||
|
|
||||||
|
items.append(
|
||||||
|
TrainingDocumentItem(
|
||||||
|
document_id=str(doc.document_id),
|
||||||
|
filename=doc.filename,
|
||||||
|
annotation_count=len(annotations),
|
||||||
|
annotation_sources=sources,
|
||||||
|
used_in_training=used_in_training,
|
||||||
|
last_modified=doc.updated_at,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return TrainingDocumentsResponse(
|
||||||
|
total=total,
|
||||||
|
limit=limit,
|
||||||
|
offset=offset,
|
||||||
|
documents=items,
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/models/{task_id}/download",
|
||||||
|
responses={
|
||||||
|
401: {"model": ErrorResponse, "description": "Invalid token"},
|
||||||
|
404: {"model": ErrorResponse, "description": "Model not found"},
|
||||||
|
},
|
||||||
|
summary="Download trained model",
|
||||||
|
description="Download trained model weights file.",
|
||||||
|
)
|
||||||
|
async def download_model(
|
||||||
|
task_id: str,
|
||||||
|
admin_token: AdminTokenDep,
|
||||||
|
db: AdminDBDep,
|
||||||
|
):
|
||||||
|
"""Download trained model."""
|
||||||
|
from fastapi.responses import FileResponse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
_validate_uuid(task_id, "task_id")
|
||||||
|
|
||||||
|
# Verify ownership
|
||||||
|
task = db.get_training_task_by_token(task_id, admin_token)
|
||||||
|
if task is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Training task not found or does not belong to this token",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if model exists
|
||||||
|
if not task.model_path:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Model file not available for this task",
|
||||||
|
)
|
||||||
|
|
||||||
|
model_path = Path(task.model_path)
|
||||||
|
if not model_path.exists():
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail="Model file not found on disk",
|
||||||
|
)
|
||||||
|
|
||||||
|
return FileResponse(
|
||||||
|
path=str(model_path),
|
||||||
|
media_type="application/octet-stream",
|
||||||
|
filename=f"{task.name}_model.pt",
|
||||||
|
)
|
||||||
|
|
||||||
|
@router.get(
|
||||||
|
"/models",
|
||||||
|
response_model=TrainingModelsResponse,
|
||||||
|
responses={
|
||||||
|
401: {"model": ErrorResponse, "description": "Invalid token"},
|
||||||
|
},
|
||||||
|
summary="Get trained models",
|
||||||
|
description="Get list of trained models with metrics and download links.",
|
||||||
|
)
|
||||||
|
async def get_training_models(
|
||||||
|
admin_token: AdminTokenDep,
|
||||||
|
db: AdminDBDep,
|
||||||
|
status: Annotated[
|
||||||
|
str | None,
|
||||||
|
Query(description="Filter by status (completed, failed, etc.)"),
|
||||||
|
] = None,
|
||||||
|
limit: Annotated[
|
||||||
|
int,
|
||||||
|
Query(ge=1, le=100, description="Page size"),
|
||||||
|
] = 20,
|
||||||
|
offset: Annotated[
|
||||||
|
int,
|
||||||
|
Query(ge=0, description="Offset"),
|
||||||
|
] = 0,
|
||||||
|
) -> TrainingModelsResponse:
|
||||||
|
"""Get list of trained models."""
|
||||||
|
# Get training tasks
|
||||||
|
tasks, total = db.get_training_tasks_by_token(
|
||||||
|
admin_token=admin_token,
|
||||||
|
status=status if status else "completed",
|
||||||
|
limit=limit,
|
||||||
|
offset=offset,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build response items
|
||||||
|
items = []
|
||||||
|
for task in tasks:
|
||||||
|
# Build metrics
|
||||||
|
metrics = ModelMetrics(
|
||||||
|
mAP=task.metrics_mAP,
|
||||||
|
precision=task.metrics_precision,
|
||||||
|
recall=task.metrics_recall,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build download URL if model exists
|
||||||
|
download_url = None
|
||||||
|
if task.model_path and task.status == "completed":
|
||||||
|
download_url = f"/api/v1/admin/training/models/{task.task_id}/download"
|
||||||
|
|
||||||
|
items.append(
|
||||||
|
TrainingModelItem(
|
||||||
|
task_id=str(task.task_id),
|
||||||
|
name=task.name,
|
||||||
|
status=TrainingStatus(task.status),
|
||||||
|
document_count=task.document_count,
|
||||||
|
created_at=task.created_at,
|
||||||
|
completed_at=task.completed_at,
|
||||||
|
metrics=metrics,
|
||||||
|
model_path=task.model_path,
|
||||||
|
download_url=download_url,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return TrainingModelsResponse(
|
            total=total,
            limit=limit,
            offset=offset,
            models=items,
        )

    # =========================================================================
    # Export Endpoints
    # =========================================================================

    @router.post(
        "/export",
        response_model=ExportResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid request"},
            401: {"model": ErrorResponse, "description": "Invalid token"},
        },
        summary="Export annotations",
        description="Export annotations in YOLO format for training.",
    )
    async def export_annotations(
        request: ExportRequest,
        admin_token: AdminTokenDep,
        db: AdminDBDep,
    ) -> ExportResponse:
        """Export annotations for training."""
        from pathlib import Path
        import shutil

        # Validate format
        if request.format not in ("yolo", "coco", "voc"):
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported export format: {request.format}",
            )

        # Get labeled documents
        documents = db.get_labeled_documents_for_export(admin_token)

        if not documents:
            raise HTTPException(
                status_code=400,
                detail="No labeled documents available for export",
            )

        # Create export directory
        export_dir = Path("data/exports") / f"export_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
        export_dir.mkdir(parents=True, exist_ok=True)

        # YOLO format directories
        (export_dir / "images" / "train").mkdir(parents=True, exist_ok=True)
        (export_dir / "images" / "val").mkdir(parents=True, exist_ok=True)
        (export_dir / "labels" / "train").mkdir(parents=True, exist_ok=True)
        (export_dir / "labels" / "val").mkdir(parents=True, exist_ok=True)

        # Calculate train/val split
        total_docs = len(documents)
        train_count = int(total_docs * request.split_ratio)
        train_docs = documents[:train_count]
        val_docs = documents[train_count:]

        total_images = 0
        total_annotations = 0

        # Export documents
        for split, docs in [("train", train_docs), ("val", val_docs)]:
            for doc in docs:
                # Get annotations
                annotations = db.get_annotations_for_document(str(doc.document_id))

                if not annotations:
                    continue

                # Export each page
                for page_num in range(1, doc.page_count + 1):
                    page_annotations = [a for a in annotations if a.page_number == page_num]

                    if not page_annotations and not request.include_images:
                        continue

                    # Copy image
                    src_image = Path("data/admin_images") / str(doc.document_id) / f"page_{page_num}.png"
                    if not src_image.exists():
                        continue

                    image_name = f"{doc.document_id}_page{page_num}.png"
                    dst_image = export_dir / "images" / split / image_name
                    shutil.copy(src_image, dst_image)
                    total_images += 1

                    # Write YOLO label file
                    label_name = f"{doc.document_id}_page{page_num}.txt"
                    label_path = export_dir / "labels" / split / label_name

                    with open(label_path, "w") as f:
                        for ann in page_annotations:
                            # YOLO format: class_id x_center y_center width height
                            line = f"{ann.class_id} {ann.x_center:.6f} {ann.y_center:.6f} {ann.width:.6f} {ann.height:.6f}\n"
                            f.write(line)
                            total_annotations += 1

        # Create data.yaml
        from src.data.admin_models import FIELD_CLASSES

        yaml_content = f"""# Auto-generated YOLO dataset config
path: {export_dir.absolute()}
train: images/train
val: images/val

nc: {len(FIELD_CLASSES)}
names: {list(FIELD_CLASSES.values())}
"""
        (export_dir / "data.yaml").write_text(yaml_content)

        return ExportResponse(
            status="completed",
            export_path=str(export_dir),
            total_images=total_images,
            total_annotations=total_annotations,
            train_count=len(train_docs),
            val_count=len(val_docs),
            message=f"Exported {total_images} images with {total_annotations} annotations",
        )

    return router
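For reference, a hypothetical client sketch for the export endpoint above. The mount path assumes the admin router ends up under `/api/v1/admin` as in app.py, and the `X-Admin-Token` header comes from src/web/core/auth.py; token and split ratio are placeholders.

```python
# Hypothetical usage sketch; endpoint path and token are assumptions.
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/admin/export",
    headers={"X-Admin-Token": "<admin-token>"},
    json={"format": "yolo", "split_ratio": 0.8, "include_images": True},
)
resp.raise_for_status()
print(resp.json()["export_path"])  # e.g. data/exports/export_20250101_120000
```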
src/web/api/v1/batch/__init__.py (new file, 0 lines)

src/web/api/v1/batch/routes.py (new file, 236 lines)
@@ -0,0 +1,236 @@
"""
Batch Upload API Routes

Endpoints for batch uploading documents via ZIP files with CSV metadata.
"""

import io
import logging
import zipfile
from datetime import datetime
from typing import Annotated
from uuid import UUID

from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, Form
from fastapi.responses import JSONResponse

from src.data.admin_db import AdminDB
from src.web.core.auth import validate_admin_token, get_admin_db
from src.web.services.batch_upload import BatchUploadService, MAX_COMPRESSED_SIZE, MAX_UNCOMPRESSED_SIZE
from src.web.workers.batch_queue import BatchTask, get_batch_queue

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/admin/batch", tags=["batch-upload"])


@router.post("/upload")
async def upload_batch(
    file: UploadFile = File(...),
    upload_source: str = Form(default="ui"),
    async_mode: bool = Form(default=True),
    auto_label: bool = Form(default=True),
    admin_token: Annotated[str, Depends(validate_admin_token)] = None,
    admin_db: Annotated[AdminDB, Depends(get_admin_db)] = None,
) -> dict:
    """Upload a batch of documents via ZIP file.

    The ZIP file can contain:
    - Multiple PDF files
    - Optional CSV file with field values for auto-labeling

    CSV format:
    - Required column: DocumentId (matches PDF filename without extension)
    - Optional columns: InvoiceNumber, InvoiceDate, InvoiceDueDate, Amount,
      OCR, Bankgiro, Plusgiro, customer_number, supplier_organisation_number

    Args:
        file: ZIP file upload
        upload_source: Upload source (ui or api)
        async_mode: If True, queue the upload and return 202 immediately
        auto_label: If True, trigger auto-labeling after upload
        admin_token: Admin authentication token
        admin_db: Admin database interface

    Returns:
        Batch upload result with batch_id and status
    """
    if not file.filename or not file.filename.lower().endswith('.zip'):
        raise HTTPException(status_code=400, detail="Only ZIP files are supported")

    # Check compressed size
    if file.size and file.size > MAX_COMPRESSED_SIZE:
        max_mb = MAX_COMPRESSED_SIZE / (1024 * 1024)
        raise HTTPException(
            status_code=400,
            detail=f"File size exceeds {max_mb:.0f}MB limit"
        )

    try:
        # Read file content
        zip_content = await file.read()

        # Additional security validation before processing
        try:
            with zipfile.ZipFile(io.BytesIO(zip_content)) as test_zip:
                # Quick validation of ZIP structure
                test_zip.testzip()
        except zipfile.BadZipFile:
            raise HTTPException(status_code=400, detail="Invalid ZIP file format")

        if async_mode:
            # Async mode: Queue task and return immediately
            from uuid import uuid4

            batch_id = uuid4()

            # Create batch task for background processing
            task = BatchTask(
                batch_id=batch_id,
                admin_token=admin_token,
                zip_content=zip_content,
                zip_filename=file.filename,
                upload_source=upload_source,
                auto_label=auto_label,
                created_at=datetime.utcnow(),
            )

            # Submit to queue
            queue = get_batch_queue()
            if not queue.submit(task):
                raise HTTPException(
                    status_code=503,
                    detail="Processing queue is full. Please try again later."
                )

            logger.info(
                f"Batch upload queued: batch_id={batch_id}, "
                f"filename={file.filename}, async_mode=True"
            )

            # Return 202 Accepted with batch_id and status URL
            return JSONResponse(
                status_code=202,
                content={
                    "status": "accepted",
                    "batch_id": str(batch_id),
                    "message": "Batch upload queued for processing",
                    "status_url": f"/api/v1/admin/batch/status/{batch_id}",
                    "queue_depth": queue.get_queue_depth(),
                }
            )
        else:
            # Sync mode: Process immediately and return results
            service = BatchUploadService(admin_db)
            result = service.process_zip_upload(
                admin_token=admin_token,
                zip_filename=file.filename,
                zip_content=zip_content,
                upload_source=upload_source,
            )

            logger.info(
                f"Batch upload completed: batch_id={result.get('batch_id')}, "
                f"status={result.get('status')}, files={result.get('successful_files')}"
            )

            return result

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error processing batch upload: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail="Failed to process batch upload. Please contact support."
        )


@router.get("/status/{batch_id}")
async def get_batch_status(
    batch_id: str,
    admin_token: Annotated[str, Depends(validate_admin_token)] = None,
    admin_db: Annotated[AdminDB, Depends(get_admin_db)] = None,
) -> dict:
    """Get batch upload status and file processing details.

    Args:
        batch_id: Batch upload ID
        admin_token: Admin authentication token
        admin_db: Admin database interface

    Returns:
        Batch status with file processing details
    """
    # Validate UUID format
    try:
        batch_uuid = UUID(batch_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid batch ID format")

    # Check batch exists and verify ownership
    batch = admin_db.get_batch_upload(batch_uuid)
    if not batch:
        raise HTTPException(status_code=404, detail="Batch not found")

    # CRITICAL: Verify ownership
    if batch.admin_token != admin_token:
        raise HTTPException(
            status_code=403,
            detail="You do not have access to this batch"
        )

    # Now safe to return details
    service = BatchUploadService(admin_db)
    result = service.get_batch_status(batch_id)

    return result


@router.get("/list")
async def list_batch_uploads(
    admin_token: Annotated[str, Depends(validate_admin_token)] = None,
    admin_db: Annotated[AdminDB, Depends(get_admin_db)] = None,
    limit: int = 50,
    offset: int = 0,
) -> dict:
    """List batch uploads for the current admin token.

    Args:
        admin_token: Admin authentication token
        admin_db: Admin database interface
        limit: Maximum number of results
        offset: Offset for pagination

    Returns:
        List of batch uploads
    """
    # Validate pagination parameters
    if limit < 1 or limit > 100:
        raise HTTPException(status_code=400, detail="Limit must be between 1 and 100")
    if offset < 0:
        raise HTTPException(status_code=400, detail="Offset must be non-negative")

    # Get batch uploads filtered by admin token
    batches, total = admin_db.get_batch_uploads_by_token(
        admin_token=admin_token,
        limit=limit,
        offset=offset,
    )

    return {
        "batches": [
            {
                "batch_id": str(b.batch_id),
                "filename": b.filename,
                "status": b.status,
                "total_files": b.total_files,
                "successful_files": b.successful_files,
                "failed_files": b.failed_files,
                "created_at": b.created_at.isoformat() if b.created_at else None,
                "completed_at": b.completed_at.isoformat() if b.completed_at else None,
            }
            for b in batches
        ],
        "total": total,
        "limit": limit,
        "offset": offset,
    }
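A hypothetical client sketch for the batch endpoints above: upload a ZIP in async mode, then poll the `status_url` from the 202 response. Host and token are placeholders; the paths follow the `/api/v1/admin/batch` prefix declared on the router.

```python
# Hypothetical usage sketch; server address and token are assumptions.
import time
import requests

headers = {"X-Admin-Token": "<admin-token>"}

with open("invoices.zip", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/api/v1/admin/batch/upload",
        headers=headers,
        files={"file": ("invoices.zip", f, "application/zip")},
        data={"upload_source": "api", "async_mode": "true", "auto_label": "true"},
    )
resp.raise_for_status()
status_url = resp.json()["status_url"]

# Poll until the background worker finishes the batch.
while True:
    status = requests.get(f"http://localhost:8000{status_url}", headers=headers).json()
    if status.get("status") in ("completed", "failed"):
        break
    time.sleep(2)
```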
src/web/api/v1/public/__init__.py (new file, 16 lines)
@@ -0,0 +1,16 @@
"""
Public API v1

Customer-facing endpoints for inference, async processing, and labeling.
"""

from src.web.api.v1.public.inference import create_inference_router
from src.web.api.v1.public.async_api import create_async_router, set_async_service
from src.web.api.v1.public.labeling import create_labeling_router

__all__ = [
    "create_inference_router",
    "create_async_router",
    "set_async_service",
    "create_labeling_router",
]
src/web/api/v1/public/async_api.py (new file, 372 lines)
@@ -0,0 +1,372 @@
"""
Async API Routes

FastAPI endpoints for async invoice processing.
"""

import logging
from pathlib import Path
from typing import Annotated
from uuid import UUID

from fastapi import APIRouter, File, HTTPException, Query, UploadFile

from src.web.dependencies import (
    ApiKeyDep,
    AsyncDBDep,
    PollRateLimitDep,
    SubmitRateLimitDep,
)
from src.web.schemas.inference import (
    AsyncRequestItem,
    AsyncRequestsListResponse,
    AsyncResultResponse,
    AsyncStatus,
    AsyncStatusResponse,
    AsyncSubmitResponse,
    DetectionResult,
    InferenceResult,
)
from src.web.schemas.common import ErrorResponse


def _validate_request_id(request_id: str) -> None:
    """Validate that request_id is a valid UUID format."""
    try:
        UUID(request_id)
    except ValueError:
        raise HTTPException(
            status_code=400,
            detail="Invalid request ID format. Must be a valid UUID.",
        )


logger = logging.getLogger(__name__)

# Global reference to async processing service (set during app startup)
_async_service = None


def set_async_service(service) -> None:
    """Set the async processing service instance."""
    global _async_service
    _async_service = service


def get_async_service():
    """Get the async processing service instance."""
    if _async_service is None:
        raise RuntimeError("AsyncProcessingService not initialized")
    return _async_service


def create_async_router(allowed_extensions: tuple[str, ...]) -> APIRouter:
    """Create async API router."""
    router = APIRouter(prefix="/async", tags=["Async Processing"])

    @router.post(
        "/submit",
        response_model=AsyncSubmitResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid file"},
            401: {"model": ErrorResponse, "description": "Invalid API key"},
            429: {"model": ErrorResponse, "description": "Rate limit exceeded"},
            503: {"model": ErrorResponse, "description": "Queue full"},
        },
        summary="Submit PDF for async processing",
        description="Submit a PDF or image file for asynchronous processing. "
        "Returns a request_id that can be used to poll for results.",
    )
    async def submit_document(
        api_key: SubmitRateLimitDep,
        file: UploadFile = File(..., description="PDF or image file to process"),
    ) -> AsyncSubmitResponse:
        """Submit a document for async processing."""
        # Validate filename
        if not file.filename:
            raise HTTPException(status_code=400, detail="Filename is required")

        # Validate file extension
        file_ext = Path(file.filename).suffix.lower()
        if file_ext not in allowed_extensions:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type: {file_ext}. "
                f"Allowed: {', '.join(allowed_extensions)}",
            )

        # Read file content
        try:
            content = await file.read()
        except Exception as e:
            logger.error(f"Failed to read uploaded file: {e}")
            raise HTTPException(status_code=400, detail="Failed to read file")

        # Check file size (get from config via service)
        service = get_async_service()
        max_size = service._async_config.max_file_size_mb * 1024 * 1024
        if len(content) > max_size:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: "
                f"{service._async_config.max_file_size_mb}MB",
            )

        # Submit request
        result = service.submit_request(
            api_key=api_key,
            file_content=content,
            filename=file.filename,
            content_type=file.content_type or "application/octet-stream",
        )

        if not result.success:
            if "queue" in (result.error or "").lower():
                raise HTTPException(status_code=503, detail=result.error)
            raise HTTPException(status_code=500, detail=result.error)

        return AsyncSubmitResponse(
            status="accepted",
            message="Request submitted for processing",
            request_id=result.request_id,
            estimated_wait_seconds=result.estimated_wait_seconds,
            poll_url=f"/api/v1/async/status/{result.request_id}",
        )

    @router.get(
        "/status/{request_id}",
        response_model=AsyncStatusResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid API key"},
            404: {"model": ErrorResponse, "description": "Request not found"},
            429: {"model": ErrorResponse, "description": "Polling too frequently"},
        },
        summary="Get request status",
        description="Get the current processing status of an async request.",
    )
    async def get_status(
        request_id: str,
        api_key: PollRateLimitDep,
        db: AsyncDBDep,
    ) -> AsyncStatusResponse:
        """Get the status of an async request."""
        # Validate UUID format
        _validate_request_id(request_id)

        # Get request from database (validates API key ownership)
        request = db.get_request_by_api_key(request_id, api_key)

        if request is None:
            raise HTTPException(
                status_code=404,
                detail="Request not found or does not belong to this API key",
            )

        # Get queue position for pending requests
        position = None
        if request.status == "pending":
            position = db.get_queue_position(request_id)

        # Build result URL for completed requests
        result_url = None
        if request.status == "completed":
            result_url = f"/api/v1/async/result/{request_id}"

        return AsyncStatusResponse(
            request_id=str(request.request_id),
            status=AsyncStatus(request.status),
            filename=request.filename,
            created_at=request.created_at,
            started_at=request.started_at,
            completed_at=request.completed_at,
            position_in_queue=position,
            error_message=request.error_message,
            result_url=result_url,
        )

    @router.get(
        "/result/{request_id}",
        response_model=AsyncResultResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid API key"},
            404: {"model": ErrorResponse, "description": "Request not found"},
            409: {"model": ErrorResponse, "description": "Request not completed"},
            429: {"model": ErrorResponse, "description": "Polling too frequently"},
        },
        summary="Get extraction results",
        description="Get the extraction results for a completed async request.",
    )
    async def get_result(
        request_id: str,
        api_key: PollRateLimitDep,
        db: AsyncDBDep,
    ) -> AsyncResultResponse:
        """Get the results of a completed async request."""
        # Validate UUID format
        _validate_request_id(request_id)

        # Get request from database (validates API key ownership)
        request = db.get_request_by_api_key(request_id, api_key)

        if request is None:
            raise HTTPException(
                status_code=404,
                detail="Request not found or does not belong to this API key",
            )

        # Check if completed or failed
        if request.status not in ("completed", "failed"):
            raise HTTPException(
                status_code=409,
                detail=f"Request not yet completed. Current status: {request.status}",
            )

        # Build inference result from stored data
        inference_result = None
        if request.result:
            # Convert detections to DetectionResult objects
            detections = []
            for d in request.result.get("detections", []):
                detections.append(DetectionResult(
                    field=d.get("field", ""),
                    confidence=d.get("confidence", 0.0),
                    bbox=d.get("bbox", [0, 0, 0, 0]),
                ))

            inference_result = InferenceResult(
                document_id=request.result.get("document_id", str(request.request_id)[:8]),
                success=request.result.get("success", False),
                document_type=request.result.get("document_type", "invoice"),
                fields=request.result.get("fields", {}),
                confidence=request.result.get("confidence", {}),
                detections=detections,
                processing_time_ms=request.processing_time_ms or 0.0,
                errors=request.result.get("errors", []),
            )

        # Build visualization URL
        viz_url = None
        if request.visualization_path:
            viz_url = f"/api/v1/results/{request.visualization_path}"

        return AsyncResultResponse(
            request_id=str(request.request_id),
            status=AsyncStatus(request.status),
            processing_time_ms=request.processing_time_ms or 0.0,
            result=inference_result,
            visualization_url=viz_url,
        )

    @router.get(
        "/requests",
        response_model=AsyncRequestsListResponse,
        responses={
            401: {"model": ErrorResponse, "description": "Invalid API key"},
        },
        summary="List requests",
        description="List all async requests for the authenticated API key.",
    )
    async def list_requests(
        api_key: ApiKeyDep,
        db: AsyncDBDep,
        status: Annotated[
            str | None,
            Query(description="Filter by status (pending, processing, completed, failed)"),
        ] = None,
        limit: Annotated[
            int,
            Query(ge=1, le=100, description="Maximum number of results"),
        ] = 20,
        offset: Annotated[
            int,
            Query(ge=0, description="Pagination offset"),
        ] = 0,
    ) -> AsyncRequestsListResponse:
        """List all requests for the authenticated API key."""
        # Validate status filter
        if status and status not in ("pending", "processing", "completed", "failed"):
            raise HTTPException(
                status_code=400,
                detail=f"Invalid status filter: {status}. "
                "Must be one of: pending, processing, completed, failed",
            )

        # Get requests from database
        requests, total = db.get_requests_by_api_key(
            api_key=api_key,
            status=status,
            limit=limit,
            offset=offset,
        )

        # Convert to response items
        items = [
            AsyncRequestItem(
                request_id=str(r.request_id),
                status=AsyncStatus(r.status),
                filename=r.filename,
                file_size=r.file_size,
                created_at=r.created_at,
                completed_at=r.completed_at,
            )
            for r in requests
        ]

        return AsyncRequestsListResponse(
            total=total,
            limit=limit,
            offset=offset,
            requests=items,
        )

    @router.delete(
        "/requests/{request_id}",
        responses={
            401: {"model": ErrorResponse, "description": "Invalid API key"},
            404: {"model": ErrorResponse, "description": "Request not found"},
            409: {"model": ErrorResponse, "description": "Cannot delete processing request"},
        },
        summary="Cancel/delete request",
        description="Cancel a pending request or delete a completed/failed request.",
    )
    async def delete_request(
        request_id: str,
        api_key: ApiKeyDep,
        db: AsyncDBDep,
    ) -> dict:
        """Delete or cancel an async request."""
        # Validate UUID format
        _validate_request_id(request_id)

        # Get request from database
        request = db.get_request_by_api_key(request_id, api_key)

        if request is None:
            raise HTTPException(
                status_code=404,
                detail="Request not found or does not belong to this API key",
            )

        # Cannot delete processing requests
        if request.status == "processing":
            raise HTTPException(
                status_code=409,
                detail="Cannot delete a request that is currently processing",
            )

        # Delete from database (will cascade delete related records)
        conn = db.connect()
        with conn.cursor() as cursor:
            cursor.execute(
                "DELETE FROM async_requests WHERE request_id = %s",
                (request_id,),
            )
            conn.commit()

        return {
            "status": "deleted",
            "request_id": request_id,
            "message": "Request deleted successfully",
        }

    return router
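A hypothetical submit/poll/result client sketch for the async endpoints above. It assumes the router is mounted under `/api/v1` as in app.py; the `X-API-Key` header name is an assumption, since the actual key transport lives in src/web/dependencies.py, which is not shown here.

```python
# Hypothetical usage sketch; the X-API-Key header name is an assumption.
import time
import requests

BASE = "http://localhost:8000/api/v1/async"
headers = {"X-API-Key": "<api-key>"}

with open("invoice.pdf", "rb") as f:
    submit = requests.post(f"{BASE}/submit", headers=headers, files={"file": f})
submit.raise_for_status()
request_id = submit.json()["request_id"]

# Respect the per-key min_poll_interval_ms; polling too fast returns 429.
while True:
    status = requests.get(f"{BASE}/status/{request_id}", headers=headers).json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(1.5)

if status["status"] == "completed":
    result = requests.get(f"{BASE}/result/{request_id}", headers=headers).json()
    print(result["result"]["fields"])
```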
src/web/api/v1/public/inference.py (modified)
@@ -1,5 +1,5 @@
 """
-API Routes
+Inference API Routes

 FastAPI route definitions for the inference API.
 """
@@ -15,23 +15,22 @@ from typing import TYPE_CHECKING
 from fastapi import APIRouter, File, HTTPException, UploadFile, status
 from fastapi.responses import FileResponse

-from .schemas import (
-    BatchInferenceResponse,
+from src.web.schemas.inference import (
     DetectionResult,
-    ErrorResponse,
     HealthResponse,
     InferenceResponse,
     InferenceResult,
 )
+from src.web.schemas.common import ErrorResponse

 if TYPE_CHECKING:
-    from .services import InferenceService
-    from .config import StorageConfig
+    from src.web.services import InferenceService
+    from src.web.config import StorageConfig

 logger = logging.getLogger(__name__)


-def create_api_router(
+def create_inference_router(
     inference_service: "InferenceService",
     storage_config: "StorageConfig",
 ) -> APIRouter:
src/web/api/v1/public/labeling.py (new file, 203 lines)
@@ -0,0 +1,203 @@
"""
Labeling API Routes

FastAPI endpoints for pre-labeling documents with expected field values.
"""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import TYPE_CHECKING

from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile, status

from src.data.admin_db import AdminDB
from src.web.schemas.labeling import PreLabelResponse
from src.web.schemas.common import ErrorResponse

if TYPE_CHECKING:
    from src.web.services import InferenceService
    from src.web.config import StorageConfig

logger = logging.getLogger(__name__)

# Storage directory for pre-label uploads (legacy, now uses storage_config)
PRE_LABEL_UPLOAD_DIR = Path("data/pre_label_uploads")


def _convert_pdf_to_images(
    document_id: str, content: bytes, page_count: int, images_dir: Path, dpi: int
) -> None:
    """Convert PDF pages to images for annotation."""
    import fitz

    doc_images_dir = images_dir / document_id
    doc_images_dir.mkdir(parents=True, exist_ok=True)

    pdf_doc = fitz.open(stream=content, filetype="pdf")

    for page_num in range(page_count):
        page = pdf_doc[page_num]
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat)

        image_path = doc_images_dir / f"page_{page_num + 1}.png"
        pix.save(str(image_path))

    pdf_doc.close()


def get_admin_db() -> AdminDB:
    """Get admin database instance."""
    return AdminDB()


def create_labeling_router(
    inference_service: "InferenceService",
    storage_config: "StorageConfig",
) -> APIRouter:
    """
    Create API router with labeling endpoints.

    Args:
        inference_service: Inference service instance
        storage_config: Storage configuration

    Returns:
        Configured APIRouter
    """
    router = APIRouter(prefix="/api/v1", tags=["labeling"])

    # Ensure upload directory exists
    PRE_LABEL_UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

    @router.post(
        "/pre-label",
        response_model=PreLabelResponse,
        responses={
            400: {"model": ErrorResponse, "description": "Invalid file or field values"},
            500: {"model": ErrorResponse, "description": "Processing error"},
        },
        summary="Pre-label document with expected values",
        description="Upload a document with expected field values for pre-labeling. Returns document_id for result retrieval.",
    )
    async def pre_label(
        file: UploadFile = File(..., description="PDF or image file to process"),
        field_values: str = Form(
            ...,
            description="JSON object with expected field values. "
            "Keys: InvoiceNumber, InvoiceDate, InvoiceDueDate, Amount, OCR, "
            "Bankgiro, Plusgiro, customer_number, supplier_organisation_number",
        ),
        db: AdminDB = Depends(get_admin_db),
    ) -> PreLabelResponse:
        """
        Upload a document with expected field values for pre-labeling.

        Returns document_id which can be used to retrieve results later.

        Example field_values JSON:
        ```json
        {
            "InvoiceNumber": "12345",
            "Amount": "1500.00",
            "Bankgiro": "123-4567",
            "OCR": "1234567890"
        }
        ```
        """
        # Parse field_values JSON
        try:
            expected_values = json.loads(field_values)
            if not isinstance(expected_values, dict):
                raise ValueError("field_values must be a JSON object")
        except json.JSONDecodeError as e:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid JSON in field_values: {e}",
            )

        # Validate file extension
        if not file.filename:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Filename is required",
            )

        file_ext = Path(file.filename).suffix.lower()
        if file_ext not in storage_config.allowed_extensions:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Unsupported file type: {file_ext}. Allowed: {storage_config.allowed_extensions}",
            )

        # Read file content
        try:
            content = await file.read()
        except Exception as e:
            logger.error(f"Failed to read uploaded file: {e}")
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Failed to read file",
            )

        # Get page count for PDF
        page_count = 1
        if file_ext == ".pdf":
            try:
                import fitz
                pdf_doc = fitz.open(stream=content, filetype="pdf")
                page_count = len(pdf_doc)
                pdf_doc.close()
            except Exception as e:
                logger.warning(f"Failed to get PDF page count: {e}")

        # Create document record with field_values
        document_id = db.create_document(
            filename=file.filename,
            file_size=len(content),
            content_type=file.content_type or "application/octet-stream",
            file_path="",  # Will update after saving
            page_count=page_count,
            upload_source="api",
            csv_field_values=expected_values,
        )

        # Save file to admin uploads
        file_path = storage_config.admin_upload_dir / f"{document_id}{file_ext}"
        try:
            file_path.write_bytes(content)
        except Exception as e:
            logger.error(f"Failed to save file: {e}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to save file",
            )

        # Update file path in database
        db.update_document_file_path(document_id, str(file_path))

        # Convert PDF to images for annotation UI
        if file_ext == ".pdf":
            try:
                _convert_pdf_to_images(
                    document_id, content, page_count,
                    storage_config.admin_images_dir, storage_config.dpi
                )
            except Exception as e:
                logger.error(f"Failed to convert PDF to images: {e}")

        # Trigger auto-labeling
        db.update_document_status(
            document_id=document_id,
            status="auto_labeling",
            auto_label_status="pending",
        )

        logger.info(f"Pre-label document {document_id} created with {len(expected_values)} expected fields")

        return PreLabelResponse(document_id=document_id)

    return router
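A hypothetical client sketch for the pre-label endpoint above, using the field keys listed in the form description; host is a placeholder.

```python
# Hypothetical usage sketch; server address is an assumption.
import json
import requests

with open("invoice.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/api/v1/pre-label",
        files={"file": ("invoice.pdf", f, "application/pdf")},
        data={"field_values": json.dumps({
            "InvoiceNumber": "12345",
            "Amount": "1500.00",
            "Bankgiro": "123-4567",
        })},
    )
resp.raise_for_status()
document_id = resp.json()["document_id"]  # use later to retrieve auto-label results
```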
src/web/app.py (modified, 158 lines)
@@ -17,8 +17,39 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.responses import HTMLResponse

 from .config import AppConfig, default_config
-from .routes import create_api_router
-from .services import InferenceService
+from src.web.services import InferenceService
+
+# Public API imports
+from src.web.api.v1.public import (
+    create_inference_router,
+    create_async_router,
+    set_async_service,
+    create_labeling_router,
+)
+
+# Async processing imports
+from src.data.async_request_db import AsyncRequestDB
+from src.web.workers.async_queue import AsyncTaskQueue
+from src.web.services.async_processing import AsyncProcessingService
+from src.web.dependencies import init_dependencies
+from src.web.core.rate_limiter import RateLimiter
+
+# Admin API imports
+from src.web.api.v1.admin import (
+    create_annotation_router,
+    create_auth_router,
+    create_documents_router,
+    create_locks_router,
+    create_training_router,
+)
+from src.web.core.scheduler import start_scheduler, stop_scheduler
+from src.web.core.autolabel_scheduler import start_autolabel_scheduler, stop_autolabel_scheduler
+
+# Batch upload imports
+from src.web.api.v1.batch.routes import router as batch_upload_router
+from src.web.workers.batch_queue import init_batch_queue, shutdown_batch_queue
+from src.web.services.batch_upload import BatchUploadService
+from src.data.admin_db import AdminDB

 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
@@ -44,11 +75,38 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
         storage_config=config.storage,
     )

+    # Create async processing components
+    async_db = AsyncRequestDB()
+    rate_limiter = RateLimiter(async_db)
+    task_queue = AsyncTaskQueue(
+        max_size=config.async_processing.queue_max_size,
+        worker_count=config.async_processing.worker_count,
+    )
+    async_service = AsyncProcessingService(
+        inference_service=inference_service,
+        db=async_db,
+        queue=task_queue,
+        rate_limiter=rate_limiter,
+        async_config=config.async_processing,
+        storage_config=config.storage,
+    )
+
+    # Initialize dependencies for FastAPI
+    init_dependencies(async_db, rate_limiter)
+    set_async_service(async_service)
+
     @asynccontextmanager
     async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
         """Application lifespan manager."""
         logger.info("Starting Invoice Inference API...")
+
+        # Initialize database tables
+        try:
+            async_db.create_tables()
+            logger.info("Async database tables ready")
+        except Exception as e:
+            logger.error(f"Failed to initialize async database: {e}")
+
         # Initialize inference service on startup
         try:
             inference_service.initialize()
@@ -57,10 +115,75 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
             logger.error(f"Failed to initialize inference service: {e}")
             # Continue anyway - service will retry on first request

+        # Start async processing service
+        try:
+            async_service.start()
+            logger.info("Async processing service started")
+        except Exception as e:
+            logger.error(f"Failed to start async processing: {e}")
+
+        # Start batch upload queue
+        try:
+            admin_db = AdminDB()
+            batch_service = BatchUploadService(admin_db)
+            init_batch_queue(batch_service)
+            logger.info("Batch upload queue started")
+        except Exception as e:
+            logger.error(f"Failed to start batch upload queue: {e}")
+
+        # Start training scheduler
+        try:
+            start_scheduler()
+            logger.info("Training scheduler started")
+        except Exception as e:
+            logger.error(f"Failed to start training scheduler: {e}")
+
+        # Start auto-label scheduler
+        try:
+            start_autolabel_scheduler()
+            logger.info("AutoLabel scheduler started")
+        except Exception as e:
+            logger.error(f"Failed to start autolabel scheduler: {e}")
+
         yield

         logger.info("Shutting down Invoice Inference API...")
+
+        # Stop auto-label scheduler
+        try:
+            stop_autolabel_scheduler()
+            logger.info("AutoLabel scheduler stopped")
+        except Exception as e:
+            logger.error(f"Error stopping autolabel scheduler: {e}")
+
+        # Stop training scheduler
+        try:
+            stop_scheduler()
+            logger.info("Training scheduler stopped")
+        except Exception as e:
+            logger.error(f"Error stopping training scheduler: {e}")
+
+        # Stop batch upload queue
+        try:
+            shutdown_batch_queue()
+            logger.info("Batch upload queue stopped")
+        except Exception as e:
+            logger.error(f"Error stopping batch upload queue: {e}")
+
+        # Stop async processing service
+        try:
+            async_service.stop(timeout=30.0)
+            logger.info("Async processing service stopped")
+        except Exception as e:
+            logger.error(f"Error stopping async service: {e}")
+
+        # Close database connection
+        try:
+            async_db.close()
+            logger.info("Database connection closed")
+        except Exception as e:
+            logger.error(f"Error closing database: {e}")

     # Create FastAPI app
     app = FastAPI(
         title="Invoice Field Extraction API",
@@ -106,9 +229,34 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
         name="results",
     )

-    # Include API routes
-    api_router = create_api_router(inference_service, config.storage)
-    app.include_router(api_router)
+    # Include public API routes
+    inference_router = create_inference_router(inference_service, config.storage)
+    app.include_router(inference_router)
+
+    async_router = create_async_router(config.storage.allowed_extensions)
+    app.include_router(async_router, prefix="/api/v1")
+
+    labeling_router = create_labeling_router(inference_service, config.storage)
+    app.include_router(labeling_router)
+
+    # Include admin API routes
+    auth_router = create_auth_router()
+    app.include_router(auth_router, prefix="/api/v1")
+
+    documents_router = create_documents_router(config.storage)
+    app.include_router(documents_router, prefix="/api/v1")
+
+    locks_router = create_locks_router()
+    app.include_router(locks_router, prefix="/api/v1")
+
+    annotation_router = create_annotation_router()
+    app.include_router(annotation_router, prefix="/api/v1")
+
+    training_router = create_training_router()
+    app.include_router(training_router, prefix="/api/v1")
+
+    # Include batch upload routes
+    app.include_router(batch_upload_router)

     # Root endpoint - serve HTML UI
     @app.get("/", response_class=HTMLResponse)
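With the wiring above, serving the app is the usual factory-plus-uvicorn pattern; a minimal hypothetical launch sketch (the uvicorn invocation, host, and port are illustrative):

```python
# Hypothetical launch sketch; host/port are placeholders.
import uvicorn
from src.web.app import create_app

app = create_app()  # default AppConfig: model, server, storage, async_processing

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
```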
src/web/config.py (modified)
@@ -8,6 +8,8 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
+
+from src.config import DEFAULT_DPI, PATHS


 @dataclass(frozen=True)
 class ModelConfig:
@@ -16,7 +18,7 @@ class ModelConfig:
     model_path: Path = Path("runs/train/invoice_fields/weights/best.pt")
     confidence_threshold: float = 0.5
     use_gpu: bool = True
-    dpi: int = 150
+    dpi: int = DEFAULT_DPI


 @dataclass(frozen=True)
@@ -32,19 +34,59 @@ class ServerConfig:

 @dataclass(frozen=True)
 class StorageConfig:
-    """File storage configuration."""
+    """File storage configuration.
+
+    Note: admin_upload_dir uses PATHS['pdf_dir'] so uploaded PDFs are stored
+    directly in raw_pdfs directory. This ensures consistency with CLI autolabel
+    and avoids storing duplicate files.
+    """
+
     upload_dir: Path = Path("uploads")
     result_dir: Path = Path("results")
+    admin_upload_dir: Path = field(default_factory=lambda: Path(PATHS["pdf_dir"]))
+    admin_images_dir: Path = Path("data/admin_images")
     max_file_size_mb: int = 50
     allowed_extensions: tuple[str, ...] = (".pdf", ".png", ".jpg", ".jpeg")
+    dpi: int = DEFAULT_DPI

     def __post_init__(self) -> None:
         """Create directories if they don't exist."""
         object.__setattr__(self, "upload_dir", Path(self.upload_dir))
         object.__setattr__(self, "result_dir", Path(self.result_dir))
+        object.__setattr__(self, "admin_upload_dir", Path(self.admin_upload_dir))
+        object.__setattr__(self, "admin_images_dir", Path(self.admin_images_dir))
         self.upload_dir.mkdir(parents=True, exist_ok=True)
         self.result_dir.mkdir(parents=True, exist_ok=True)
+        self.admin_upload_dir.mkdir(parents=True, exist_ok=True)
+        self.admin_images_dir.mkdir(parents=True, exist_ok=True)
+
+
+@dataclass(frozen=True)
+class AsyncConfig:
+    """Async processing configuration."""
+
+    # Queue settings
+    queue_max_size: int = 100
+    worker_count: int = 1
+    task_timeout_seconds: int = 300
+
+    # Rate limiting defaults
+    default_requests_per_minute: int = 10
+    default_max_concurrent_jobs: int = 3
+    default_min_poll_interval_ms: int = 1000
+
+    # Storage
+    result_retention_days: int = 7
+    temp_upload_dir: Path = Path("uploads/async")
+    max_file_size_mb: int = 50
+
+    # Cleanup
+    cleanup_interval_hours: int = 1
+
+    def __post_init__(self) -> None:
+        """Create directories if they don't exist."""
+        object.__setattr__(self, "temp_upload_dir", Path(self.temp_upload_dir))
+        self.temp_upload_dir.mkdir(parents=True, exist_ok=True)


 @dataclass
@@ -54,6 +96,7 @@ class AppConfig:
     model: ModelConfig = field(default_factory=ModelConfig)
     server: ServerConfig = field(default_factory=ServerConfig)
     storage: StorageConfig = field(default_factory=StorageConfig)
+    async_processing: AsyncConfig = field(default_factory=AsyncConfig)

     @classmethod
     def from_dict(cls, config_dict: dict[str, Any]) -> "AppConfig":
@@ -62,6 +105,7 @@
             model=ModelConfig(**config_dict.get("model", {})),
             server=ServerConfig(**config_dict.get("server", {})),
             storage=StorageConfig(**config_dict.get("storage", {})),
+            async_processing=AsyncConfig(**config_dict.get("async_processing", {})),
         )
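A sketch of overriding the new async section through `from_dict`, using only keys defined in the dataclasses above; the override values are illustrative.

```python
# Hypothetical configuration sketch; values are placeholders.
from src.web.config import AppConfig

config = AppConfig.from_dict({
    "model": {"confidence_threshold": 0.6},
    "storage": {"max_file_size_mb": 25},
    "async_processing": {"queue_max_size": 200, "worker_count": 2},
})
assert config.async_processing.worker_count == 2
```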
src/web/core/__init__.py (new file, 28 lines)
@@ -0,0 +1,28 @@
"""
Core Components

Reusable core functionality: authentication, rate limiting, scheduling.
"""

from src.web.core.auth import validate_admin_token, get_admin_db, AdminTokenDep, AdminDBDep
from src.web.core.rate_limiter import RateLimiter
from src.web.core.scheduler import start_scheduler, stop_scheduler, get_training_scheduler
from src.web.core.autolabel_scheduler import (
    start_autolabel_scheduler,
    stop_autolabel_scheduler,
    get_autolabel_scheduler,
)

__all__ = [
    "validate_admin_token",
    "get_admin_db",
    "AdminTokenDep",
    "AdminDBDep",
    "RateLimiter",
    "start_scheduler",
    "stop_scheduler",
    "get_training_scheduler",
    "start_autolabel_scheduler",
    "stop_autolabel_scheduler",
    "get_autolabel_scheduler",
]
src/web/core/auth.py (new file, 60 lines)
@@ -0,0 +1,60 @@
"""
Admin Authentication

FastAPI dependencies for admin token authentication.
"""

import logging
from typing import Annotated

from fastapi import Depends, Header, HTTPException

from src.data.admin_db import AdminDB
from src.data.database import get_session_context

logger = logging.getLogger(__name__)

# Global AdminDB instance
_admin_db: AdminDB | None = None


def get_admin_db() -> AdminDB:
    """Get the AdminDB instance."""
    global _admin_db
    if _admin_db is None:
        _admin_db = AdminDB()
    return _admin_db


def reset_admin_db() -> None:
    """Reset the AdminDB instance (for testing)."""
    global _admin_db
    _admin_db = None


async def validate_admin_token(
    x_admin_token: Annotated[str | None, Header()] = None,
    admin_db: AdminDB = Depends(get_admin_db),
) -> str:
    """Validate admin token from header."""
    if not x_admin_token:
        raise HTTPException(
            status_code=401,
            detail="Admin token required. Provide X-Admin-Token header.",
        )

    if not admin_db.is_valid_admin_token(x_admin_token):
        raise HTTPException(
            status_code=401,
            detail="Invalid or expired admin token.",
        )

    # Update last used timestamp
    admin_db.update_admin_token_usage(x_admin_token)

    return x_admin_token


# Type aliases for dependency injection
AdminTokenDep = Annotated[str, Depends(validate_admin_token)]
AdminDBDep = Annotated[AdminDB, Depends(get_admin_db)]
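The two aliases make admin protection a one-line annotation on any route; a minimal sketch of a hypothetical protected endpoint using them (the router path is illustrative):

```python
# Hypothetical usage sketch; the route and prefix are placeholders.
from fastapi import APIRouter
from src.web.core.auth import AdminTokenDep, AdminDBDep

router = APIRouter(prefix="/api/v1/admin/example", tags=["example"])

@router.get("/whoami")
async def whoami(admin_token: AdminTokenDep, db: AdminDBDep) -> dict:
    # validate_admin_token has already run, so the token here is guaranteed valid.
    return {"token_prefix": admin_token[:8]}
```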
src/web/core/autolabel_scheduler.py (new file, 153 lines)
@@ -0,0 +1,153 @@
"""
Auto-Label Scheduler

Background scheduler for processing documents pending auto-labeling.
"""

import logging
import threading
from pathlib import Path

from src.data.admin_db import AdminDB
from src.web.services.db_autolabel import (
    get_pending_autolabel_documents,
    process_document_autolabel,
)

logger = logging.getLogger(__name__)


class AutoLabelScheduler:
    """Scheduler for auto-labeling tasks."""

    def __init__(
        self,
        check_interval_seconds: int = 10,
        batch_size: int = 5,
        output_dir: Path | None = None,
    ):
        """
        Initialize auto-label scheduler.

        Args:
            check_interval_seconds: Interval to check for pending tasks
            batch_size: Number of documents to process per batch
            output_dir: Output directory for temporary files
        """
        self._check_interval = check_interval_seconds
        self._batch_size = batch_size
        self._output_dir = output_dir or Path("data/autolabel_output")
        self._running = False
        self._thread: threading.Thread | None = None
        self._stop_event = threading.Event()
        self._db = AdminDB()

    def start(self) -> None:
        """Start the scheduler."""
        if self._running:
            logger.warning("AutoLabel scheduler already running")
            return

        self._running = True
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()
        logger.info("AutoLabel scheduler started")

    def stop(self) -> None:
        """Stop the scheduler."""
        if not self._running:
            return

        self._running = False
        self._stop_event.set()

        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None

        logger.info("AutoLabel scheduler stopped")

    @property
    def is_running(self) -> bool:
        """Check if scheduler is running."""
        return self._running

    def _run_loop(self) -> None:
        """Main scheduler loop."""
        while self._running:
            try:
                self._process_pending_documents()
            except Exception as e:
                logger.error(f"Error in autolabel scheduler loop: {e}", exc_info=True)

            # Wait for next check interval
            self._stop_event.wait(timeout=self._check_interval)

    def _process_pending_documents(self) -> None:
        """Check and process pending auto-label documents."""
        try:
            documents = get_pending_autolabel_documents(
                self._db, limit=self._batch_size
            )

            if not documents:
                return

            logger.info(f"Processing {len(documents)} pending autolabel documents")

            for doc in documents:
                if self._stop_event.is_set():
                    break

                try:
                    result = process_document_autolabel(
                        document=doc,
                        db=self._db,
                        output_dir=self._output_dir,
                    )

                    if result.get("success"):
                        logger.info(
                            f"AutoLabel completed for document {doc.document_id}"
                        )
                    else:
                        logger.warning(
                            f"AutoLabel failed for document {doc.document_id}: "
                            f"{result.get('error', 'Unknown error')}"
                        )

                except Exception as e:
                    logger.error(
                        f"Error processing document {doc.document_id}: {e}",
                        exc_info=True,
                    )

        except Exception as e:
            logger.error(f"Error fetching pending documents: {e}", exc_info=True)


# Global scheduler instance
_autolabel_scheduler: AutoLabelScheduler | None = None


def get_autolabel_scheduler() -> AutoLabelScheduler:
    """Get the auto-label scheduler instance."""
    global _autolabel_scheduler
    if _autolabel_scheduler is None:
        _autolabel_scheduler = AutoLabelScheduler()
    return _autolabel_scheduler


def start_autolabel_scheduler() -> None:
    """Start the global auto-label scheduler."""
    scheduler = get_autolabel_scheduler()
    scheduler.start()


def stop_autolabel_scheduler() -> None:
    """Stop the global auto-label scheduler."""
    global _autolabel_scheduler
    if _autolabel_scheduler:
        _autolabel_scheduler.stop()
        _autolabel_scheduler = None
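Outside the app lifespan, the same module-level helpers can drive the scheduler in a one-off script; a minimal hypothetical sketch (the 60-second window is arbitrary):

```python
# Hypothetical standalone sketch; the sleep duration is a placeholder.
import time
from src.web.core.autolabel_scheduler import (
    start_autolabel_scheduler,
    stop_autolabel_scheduler,
)

start_autolabel_scheduler()  # spawns the daemon polling thread
try:
    time.sleep(60)  # let it process pending documents for a while
finally:
    stop_autolabel_scheduler()
```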
src/web/core/rate_limiter.py (new file, 211 lines)
@@ -0,0 +1,211 @@
"""
Rate Limiter Implementation

Thread-safe rate limiter with sliding window algorithm for API key-based limiting.
"""

import logging
import time
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
from threading import Lock
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from src.data.async_request_db import AsyncRequestDB

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class RateLimitConfig:
    """Rate limit configuration for an API key."""

    requests_per_minute: int = 10
    max_concurrent_jobs: int = 3
    min_poll_interval_ms: int = 1000  # Minimum time between status polls


@dataclass
class RateLimitStatus:
    """Current rate limit status."""

    allowed: bool
    remaining_requests: int
    reset_at: datetime
    retry_after_seconds: int | None = None
    reason: str | None = None


class RateLimiter:
    """
    Thread-safe rate limiter with sliding window algorithm.

    Tracks:
    - Requests per minute (sliding window)
    - Concurrent active jobs
    - Poll frequency per request_id
    """

    def __init__(self, db: "AsyncRequestDB") -> None:
        self._db = db
        self._lock = Lock()
        # In-memory tracking for fast checks
        self._request_windows: dict[str, list[float]] = defaultdict(list)
        # (api_key, request_id) -> last_poll timestamp
        self._poll_timestamps: dict[tuple[str, str], float] = {}
        # Cache for API key configs (TTL 60 seconds)
        self._config_cache: dict[str, tuple[RateLimitConfig, float]] = {}
        self._config_cache_ttl = 60.0

    def check_submit_limit(self, api_key: str) -> RateLimitStatus:
        """Check if API key can submit a new request."""
        config = self._get_config(api_key)

        with self._lock:
            now = time.time()
            window_start = now - 60  # 1 minute window

            # Clean old entries
            self._request_windows[api_key] = [
                ts for ts in self._request_windows[api_key]
                if ts > window_start
            ]

            current_count = len(self._request_windows[api_key])

            if current_count >= config.requests_per_minute:
                oldest = min(self._request_windows[api_key])
                retry_after = int(oldest + 60 - now) + 1
                return RateLimitStatus(
                    allowed=False,
                    remaining_requests=0,
                    reset_at=datetime.utcnow() + timedelta(seconds=retry_after),
                    retry_after_seconds=max(1, retry_after),
                    reason="Rate limit exceeded: too many requests per minute",
                )

            # Check concurrent jobs (query database) - inside lock for thread safety
            active_jobs = self._db.count_active_jobs(api_key)
            if active_jobs >= config.max_concurrent_jobs:
                return RateLimitStatus(
                    allowed=False,
                    remaining_requests=config.requests_per_minute - current_count,
                    reset_at=datetime.utcnow() + timedelta(seconds=30),
                    retry_after_seconds=30,
                    reason=f"Max concurrent jobs ({config.max_concurrent_jobs}) reached",
                )

            return RateLimitStatus(
                allowed=True,
                remaining_requests=config.requests_per_minute - current_count - 1,
                reset_at=datetime.utcnow() + timedelta(seconds=60),
            )
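    # Worked example (illustrative, not part of the original file): with the
    # default requests_per_minute=10, suppose the window already holds 10
    # timestamps and the oldest was recorded 45 seconds ago. Then
    # retry_after = int(oldest + 60 - now) + 1 = int(15) + 1 = 16, so the
    # caller is told to retry in 16 s, once that oldest entry has slid out
    # of the one-minute window.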
    def record_request(self, api_key: str) -> None:
        """Record a successful request submission."""
        with self._lock:
            self._request_windows[api_key].append(time.time())

        # Also record in database for persistence
        try:
            self._db.record_rate_limit_event(api_key, "request")
        except Exception as e:
            logger.warning(f"Failed to record rate limit event: {e}")

    def check_poll_limit(self, api_key: str, request_id: str) -> RateLimitStatus:
        """Check if polling is allowed (prevent abuse)."""
        config = self._get_config(api_key)
        key = (api_key, request_id)

        with self._lock:
            now = time.time()
            last_poll = self._poll_timestamps.get(key, 0)
            elapsed_ms = (now - last_poll) * 1000

            if elapsed_ms < config.min_poll_interval_ms:
                # Suggest exponential backoff
                wait_ms = min(
                    config.min_poll_interval_ms * 2,
                    5000,  # Max 5 seconds
                )
                retry_after = int(wait_ms / 1000) + 1
                return RateLimitStatus(
                    allowed=False,
                    remaining_requests=0,
                    reset_at=datetime.utcnow() + timedelta(milliseconds=wait_ms),
                    retry_after_seconds=retry_after,
                    reason="Polling too frequently. Please wait before retrying.",
                )

            # Update poll timestamp
            self._poll_timestamps[key] = now

            return RateLimitStatus(
                allowed=True,
                remaining_requests=999,  # No limit on poll count, just frequency
                reset_at=datetime.utcnow(),
            )
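    # Worked example (illustrative): with min_poll_interval_ms=1000, a poll
    # arriving 400 ms after the previous one is rejected with
    # wait_ms = min(1000 * 2, 5000) = 2000 and
    # retry_after = int(2000 / 1000) + 1 = 3 seconds.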
    def _get_config(self, api_key: str) -> RateLimitConfig:
        """Get rate limit config for API key with caching."""
        now = time.time()

        # Check cache
        if api_key in self._config_cache:
            cached_config, cached_at = self._config_cache[api_key]
            if now - cached_at < self._config_cache_ttl:
                return cached_config

        # Query database
        db_config = self._db.get_api_key_config(api_key)
        if db_config:
            config = RateLimitConfig(
                requests_per_minute=db_config.requests_per_minute,
                max_concurrent_jobs=db_config.max_concurrent_jobs,
            )
        else:
            config = RateLimitConfig()  # Default limits

        # Cache result
        self._config_cache[api_key] = (config, now)
        return config

    def cleanup_poll_timestamps(self, max_age_seconds: int = 3600) -> int:
        """Clean up old poll timestamps to prevent memory leak."""
        with self._lock:
            now = time.time()
            cutoff = now - max_age_seconds
            old_keys = [
                k for k, v in self._poll_timestamps.items()
                if v < cutoff
            ]
            for key in old_keys:
                del self._poll_timestamps[key]
            return len(old_keys)

    def cleanup_request_windows(self) -> None:
        """Clean up expired entries from request windows."""
        with self._lock:
            now = time.time()
            window_start = now - 60

            for api_key in list(self._request_windows.keys()):
                self._request_windows[api_key] = [
                    ts for ts in self._request_windows[api_key]
                    if ts > window_start
                ]
                # Remove empty entries
                if not self._request_windows[api_key]:
                    del self._request_windows[api_key]

    def get_rate_limit_headers(self, status: RateLimitStatus) -> dict[str, str]:
        """Generate rate limit headers for HTTP response."""
        headers = {
            "X-RateLimit-Remaining": str(status.remaining_requests),
            "X-RateLimit-Reset": status.reset_at.isoformat(),
        }
        if status.retry_after_seconds:
            headers["Retry-After"] = str(status.retry_after_seconds)
        return headers
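To show how the pieces fit together, a minimal route-level sketch, assuming a FastAPI endpoint and an AsyncRequestDB instance named db (the endpoint path, limiter wiring, and enqueue_job helper are assumptions for illustration, not part of this diff):

    from fastapi import FastAPI, Header, HTTPException

    app = FastAPI()
    limiter = RateLimiter(db)  # db: AsyncRequestDB, constructed elsewhere

    @app.post("/api/v1/invoices")
    def submit_invoice(x_api_key: str = Header(...)) -> dict:
        status = limiter.check_submit_limit(x_api_key)
        if not status.allowed:
            # Surface Retry-After / X-RateLimit-* headers on the 429 response
            raise HTTPException(
                status_code=429,
                detail=status.reason,
                headers=limiter.get_rate_limit_headers(status),
            )
        request_id = enqueue_job(x_api_key)  # hypothetical job-submission helper
        limiter.record_request(x_api_key)
        return {"request_id": request_id}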
329  src/web/core/scheduler.py  Normal file
@@ -0,0 +1,329 @@
"""
Admin Training Scheduler

Background scheduler for training tasks, implemented as a thread-based polling loop.
"""

import logging
import threading
from datetime import datetime
from pathlib import Path
from typing import Any

from src.data.admin_db import AdminDB

logger = logging.getLogger(__name__)


class TrainingScheduler:
    """Scheduler for training tasks."""

    def __init__(
        self,
        check_interval_seconds: int = 60,
    ):
        """
        Initialize training scheduler.

        Args:
            check_interval_seconds: Interval to check for pending tasks
        """
        self._check_interval = check_interval_seconds
        self._running = False
        self._thread: threading.Thread | None = None
        self._stop_event = threading.Event()
        self._db = AdminDB()

    def start(self) -> None:
        """Start the scheduler."""
        if self._running:
            logger.warning("Training scheduler already running")
            return

        self._running = True
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()
        logger.info("Training scheduler started")

    def stop(self) -> None:
        """Stop the scheduler."""
        if not self._running:
            return

        self._running = False
        self._stop_event.set()

        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None

        logger.info("Training scheduler stopped")

    def _run_loop(self) -> None:
        """Main scheduler loop."""
        while self._running:
            try:
                self._check_pending_tasks()
            except Exception as e:
                logger.error(f"Error in scheduler loop: {e}")

            # Wait for next check interval
            self._stop_event.wait(timeout=self._check_interval)

    def _check_pending_tasks(self) -> None:
        """Check and execute pending training tasks."""
        try:
            tasks = self._db.get_pending_training_tasks()

            for task in tasks:
                task_id = str(task.task_id)

                # Skip tasks whose scheduled time has not yet arrived
                if task.scheduled_at and task.scheduled_at > datetime.utcnow():
                    continue

                logger.info(f"Starting training task: {task_id}")

                try:
                    self._execute_task(task_id, task.config or {})
                except Exception as e:
                    logger.error(f"Training task {task_id} failed: {e}")
                    self._db.update_training_task_status(
                        task_id=task_id,
                        status="failed",
                        error_message=str(e),
                    )

        except Exception as e:
            logger.error(f"Error checking pending tasks: {e}")

    def _execute_task(self, task_id: str, config: dict[str, Any]) -> None:
        """Execute a training task."""
        # Update status to running
        self._db.update_training_task_status(task_id, "running")
        self._db.add_training_log(task_id, "INFO", "Training task started")

        try:
            # Get training configuration
            model_name = config.get("model_name", "yolo11n.pt")
            epochs = config.get("epochs", 100)
            batch_size = config.get("batch_size", 16)
            image_size = config.get("image_size", 640)
            learning_rate = config.get("learning_rate", 0.01)
            device = config.get("device", "0")
            project_name = config.get("project_name", "invoice_fields")

            # Export annotations for training
            export_result = self._export_training_data(task_id)
            if not export_result:
                raise ValueError("Failed to export training data")

            data_yaml = export_result["data_yaml"]

            self._db.add_training_log(
                task_id, "INFO",
                f"Exported {export_result['total_images']} images for training",
            )

            # Run YOLO training
            result = self._run_yolo_training(
                task_id=task_id,
                model_name=model_name,
                data_yaml=data_yaml,
                epochs=epochs,
                batch_size=batch_size,
                image_size=image_size,
                learning_rate=learning_rate,
                device=device,
                project_name=project_name,
            )

            # Update task with results
            self._db.update_training_task_status(
                task_id=task_id,
                status="completed",
                result_metrics=result.get("metrics"),
                model_path=result.get("model_path"),
            )
            self._db.add_training_log(task_id, "INFO", "Training completed successfully")

        except Exception as e:
            logger.error(f"Training task {task_id} failed: {e}")
            self._db.add_training_log(task_id, "ERROR", f"Training failed: {e}")
            raise

    def _export_training_data(self, task_id: str) -> dict[str, Any] | None:
        """Export training data for a task."""
        import shutil

        from src.data.admin_models import FIELD_CLASSES

        # Get all labeled documents
        documents = self._db.get_labeled_documents_for_export()

        if not documents:
            self._db.add_training_log(task_id, "ERROR", "No labeled documents available")
            return None

        # Create export directory
        export_dir = Path("data/training") / task_id
        export_dir.mkdir(parents=True, exist_ok=True)

        # YOLO format directories
        (export_dir / "images" / "train").mkdir(parents=True, exist_ok=True)
        (export_dir / "images" / "val").mkdir(parents=True, exist_ok=True)
        (export_dir / "labels" / "train").mkdir(parents=True, exist_ok=True)
        (export_dir / "labels" / "val").mkdir(parents=True, exist_ok=True)

        # 80/20 train/val split
        total_docs = len(documents)
        train_count = int(total_docs * 0.8)
        train_docs = documents[:train_count]
        val_docs = documents[train_count:]

        total_images = 0
        total_annotations = 0

        # Export documents
        for split, docs in [("train", train_docs), ("val", val_docs)]:
            for doc in docs:
                annotations = self._db.get_annotations_for_document(str(doc.document_id))

                if not annotations:
                    continue

                for page_num in range(1, doc.page_count + 1):
                    page_annotations = [a for a in annotations if a.page_number == page_num]

                    # Copy image
                    src_image = Path("data/admin_images") / str(doc.document_id) / f"page_{page_num}.png"
                    if not src_image.exists():
                        continue

                    image_name = f"{doc.document_id}_page{page_num}.png"
                    dst_image = export_dir / "images" / split / image_name
                    shutil.copy(src_image, dst_image)
                    total_images += 1

                    # Write YOLO label
                    label_name = f"{doc.document_id}_page{page_num}.txt"
                    label_path = export_dir / "labels" / split / label_name

                    with open(label_path, "w") as f:
                        for ann in page_annotations:
                            line = f"{ann.class_id} {ann.x_center:.6f} {ann.y_center:.6f} {ann.width:.6f} {ann.height:.6f}\n"
                            f.write(line)
                            total_annotations += 1

        # Create data.yaml
        yaml_path = export_dir / "data.yaml"
        yaml_content = f"""path: {export_dir.absolute()}
train: images/train
val: images/val

nc: {len(FIELD_CLASSES)}
names: {list(FIELD_CLASSES.values())}
"""
        yaml_path.write_text(yaml_content)

        return {
            "data_yaml": str(yaml_path),
            "total_images": total_images,
            "total_annotations": total_annotations,
        }
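    # Example of the exported artifacts (illustrative values, not from a real
    # run): each label file holds one line per annotation in normalized YOLO
    # format, "<class_id> <x_center> <y_center> <width> <height>", e.g.
    #   3 0.512340 0.084210 0.210000 0.031500
    # and the generated data.yaml looks like:
    #   path: /abs/path/data/training/<task_id>
    #   train: images/train
    #   val: images/val
    #   nc: <len(FIELD_CLASSES)>
    #   names: [the FIELD_CLASSES values, in class-id order]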
    def _run_yolo_training(
        self,
        task_id: str,
        model_name: str,
        data_yaml: str,
        epochs: int,
        batch_size: int,
        image_size: int,
        learning_rate: float,
        device: str,
        project_name: str,
    ) -> dict[str, Any]:
        """Run YOLO training."""
        try:
            from ultralytics import YOLO

            # Log training start
            self._db.add_training_log(
                task_id, "INFO",
                f"Starting YOLO training: model={model_name}, epochs={epochs}, batch={batch_size}",
            )

            # Load model
            model = YOLO(model_name)

            # Train
            results = model.train(
                data=data_yaml,
                epochs=epochs,
                batch=batch_size,
                imgsz=image_size,
                lr0=learning_rate,
                device=device,
                project=f"runs/train/{project_name}",
                name=f"task_{task_id[:8]}",
                exist_ok=True,
                verbose=True,
            )

            # Get best model path
            best_model = Path(results.save_dir) / "weights" / "best.pt"

            # Extract metrics
            metrics = {}
            if hasattr(results, "results_dict"):
                metrics = {
                    "mAP50": results.results_dict.get("metrics/mAP50(B)", 0),
                    "mAP50-95": results.results_dict.get("metrics/mAP50-95(B)", 0),
                    "precision": results.results_dict.get("metrics/precision(B)", 0),
                    "recall": results.results_dict.get("metrics/recall(B)", 0),
                }

            self._db.add_training_log(
                task_id, "INFO",
                f"Training completed. mAP@0.5: {metrics.get('mAP50', 'N/A')}",
            )

            return {
                "model_path": str(best_model) if best_model.exists() else None,
                "metrics": metrics,
            }

        except ImportError:
            self._db.add_training_log(task_id, "ERROR", "Ultralytics not installed")
            raise ValueError("Ultralytics (YOLO) not installed")
        except Exception as e:
            self._db.add_training_log(task_id, "ERROR", f"YOLO training failed: {e}")
            raise


# Global scheduler instance
_scheduler: TrainingScheduler | None = None


def get_training_scheduler() -> TrainingScheduler:
    """Get the training scheduler instance."""
    global _scheduler
    if _scheduler is None:
        _scheduler = TrainingScheduler()
    return _scheduler


def start_scheduler() -> None:
    """Start the global training scheduler."""
    scheduler = get_training_scheduler()
    scheduler.start()


def stop_scheduler() -> None:
    """Stop the global training scheduler."""
    global _scheduler
    if _scheduler:
        _scheduler.stop()
        _scheduler = None
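For reference, the config dict consumed by _execute_task falls back to a default for any missing key, so a task row can carry a sparse config. An illustrative value (keys and defaults taken from the code above; the task-creation API that stores it is not part of this diff):

    config = {
        "model_name": "yolo11n.pt",   # default if omitted
        "epochs": 150,                # overrides the default of 100
        "batch_size": 16,
        "image_size": 640,
        "learning_rate": 0.01,
        "device": "0",                # CUDA device index as a string
        "project_name": "invoice_fields",
    }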