Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions

129
configs/default.yaml Normal file
View File

@@ -0,0 +1,129 @@
# Invoice Master POC v2 Configuration
# Default configuration for invoice field extraction system
# PDF Processing
# Rendering resolution and the threshold used to decide whether a PDF
# is treated as text-based.
pdf:
  dpi: 300  # Resolution for rendering PDFs to images
  min_text_chars: 30  # Minimum chars to consider PDF as text-based
# OCR Settings
# Engine selection, recognition language, and GPU toggle.
ocr:
  engine: paddleocr  # OCR engine to use
  lang: en  # Language code (en, sv, ch, etc.)
  use_gpu: false  # Enable GPU acceleration
# Field Normalization
# Per-field format descriptions. Each field has its own sub-mapping so that
# keys such as `format` do not collide between bankgiro and plusgiro.
normalize:
  # Bankgiro formats
  bankgiro:
    format: "XXXX-XXXX"  # Standard 8-digit format
    alternatives:
      - "XXX-XXXX"  # 7-digit format

  # Plusgiro formats
  plusgiro:
    format: "XXXXXXX-X"  # Standard format with check digit

  # Amount formats
  amount:
    decimal_separator: ","  # Swedish uses comma
    thousand_separator: " "  # Space for thousands
    currency_symbols:
      - "SEK"
      - "kr"

  # Date formats
  # Patterns use percent-style date directives
  # (presumably strftime/strptime -- confirm against the consumer).
  date:
    output_format: "%Y-%m-%d"
    input_formats:
      - "%Y-%m-%d"
      - "%Y-%m-%d %H:%M:%S"
      - "%d/%m/%Y"
      - "%d.%m.%Y"
# Field Matching
# Score threshold and context-keyword lookup used when matching field values.
matching:
  min_score_threshold: 0.7  # Minimum score to accept match
  context_radius: 100  # Pixels to search for context keywords

  # Context keywords for each field (Swedish)
  # NOTE(review): nesting under `matching` is inferred from the section
  # layout -- confirm against the config loader.
  context_keywords:
    InvoiceNumber:
      - "fakturanr"
      - "fakturanummer"
      - "invoice"
    InvoiceDate:
      - "fakturadatum"
      - "datum"
    InvoiceDueDate:
      - "förfallodatum"
      - "förfaller"
      - "betalas senast"
    OCR:
      - "ocr"
      - "referens"
    Bankgiro:
      - "bankgiro"
      - "bg"
    Plusgiro:
      - "plusgiro"
      - "pg"
    Amount:
      - "att betala"
      - "summa"
      - "total"
      - "belopp"
# YOLO Training
# Model choice, training schedule, augmentation, and class-id mapping.
yolo:
  model: yolov8s  # Model architecture (yolov8n/s/m/l/x)
  epochs: 100
  batch_size: 16
  img_size: 1280  # Image size for training

  # Data augmentation
  augmentation:
    rotation: 5  # Max rotation degrees
    scale: 0.2  # Scale variation
    mosaic: 0.0  # Disable mosaic for documents
    hsv_h: 0.0  # No hue variation
    hsv_s: 0.1  # Slight saturation variation
    hsv_v: 0.2  # Brightness variation

  # Class definitions
  # Maps integer class id -> class name (7 classes, ids 0-6).
  # NOTE(review): nesting under `yolo` is inferred from the section
  # layout -- confirm against the config loader.
  classes:
    0: invoice_number
    1: invoice_date
    2: invoice_due_date
    3: ocr_number
    4: bankgiro
    5: plusgiro
    6: amount
# Auto-labeling
# Filters and bbox adjustment applied when generating training annotations.
autolabel:
  min_confidence: 0.7  # Minimum score to include in training
  bbox_padding: 0.02  # Padding around bboxes (fraction of image)
# Dataset Split
# Train/val/test fractions (0.8 + 0.1 + 0.1 = 1.0) and the seed for the split.
dataset:
  train_ratio: 0.8
  val_ratio: 0.1
  test_ratio: 0.1
  random_seed: 42
# Inference
# Detection thresholds and the regex-fallback trigger.
inference:
  confidence_threshold: 0.5  # Detection confidence threshold
  iou_threshold: 0.45  # NMS IOU threshold
  enable_fallback: true  # Enable regex fallback if YOLO fails
  fallback_min_missing: 2  # Min missing fields to trigger fallback
# Paths (relative to project root)
# Directory locations for inputs, intermediate data, and outputs.
paths:
  raw_pdfs: data/raw_pdfs
  images: data/images
  labels: data/labels
  structured_data: data/structured_data
  models: models
  reports: reports