# Features:
# - Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
# - Flexible date matching: year-month match, nearby date tolerance
# - PDF text extraction with PyMuPDF
# - OCR support for scanned documents (PaddleOCR)
# - YOLO training and inference pipeline
# - 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro,
#   Plusgiro, Amount
#
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
130 lines
3.1 KiB
YAML
130 lines
3.1 KiB
YAML
# Invoice Master POC v2 Configuration
# Default configuration for the invoice field extraction system.

# PDF Processing
# Rendering resolution and the text-vs-scanned threshold for input PDFs.
pdf:
  dpi: 300  # Resolution for rendering PDFs to images
  min_text_chars: 30  # Minimum chars to consider PDF as text-based

# OCR Settings
# Engine selection, recognition language, and hardware acceleration toggle.
ocr:
  engine: paddleocr  # OCR engine to use
  lang: en  # Language code (en, sv, ch, etc.)
  use_gpu: false  # Enable GPU acceleration

# Field Normalization
# Per-field format descriptions, grouped by field kind.
normalize:
  # Bankgiro formats
  bankgiro:
    format: "XXXX-XXXX"  # Standard 8-digit format
    alternatives:
      - "XXX-XXXX"  # 7-digit format

  # Plusgiro formats
  plusgiro:
    format: "XXXXXXX-X"  # Standard format with check digit

  # Amount formats
  amount:
    decimal_separator: ","  # Swedish uses comma
    thousand_separator: " "  # Space for thousands
    currency_symbols:
      - "SEK"
      - "kr"

  # Date formats
  # The patterns use strftime-style directives (%Y, %m, %d, %H, %M, %S).
  date:
    output_format: "%Y-%m-%d"
    input_formats:
      - "%Y-%m-%d"
      - "%Y-%m-%d %H:%M:%S"
      - "%d/%m/%Y"
      - "%d.%m.%Y"

# Field Matching
matching:
  min_score_threshold: 0.7  # Minimum score to accept match
  context_radius: 100  # Pixels to search for context keywords

  # Context keywords for each field (Swedish)
  # NOTE(review): nesting of context_keywords under `matching` is inferred
  # from the sub-section comment style; confirm against the config loader.
  context_keywords:
    InvoiceNumber:
      - "fakturanr"
      - "fakturanummer"
      - "invoice"
    InvoiceDate:
      - "fakturadatum"
      - "datum"
    InvoiceDueDate:
      - "förfallodatum"
      - "förfaller"
      - "betalas senast"
    OCR:
      - "ocr"
      - "referens"
    Bankgiro:
      - "bankgiro"
      - "bg"
    Plusgiro:
      - "plusgiro"
      - "pg"
    Amount:
      - "att betala"
      - "summa"
      - "total"
      - "belopp"

# YOLO Training
# NOTE(review): `augmentation` and `classes` are placed under `yolo` based on
# their sub-section comment style; confirm against the config loader.
yolo:
  model: yolov8s  # Model architecture (yolov8n/s/m/l/x)
  epochs: 100
  batch_size: 16
  img_size: 1280  # Image size for training

  # Data augmentation
  augmentation:
    rotation: 5  # Max rotation degrees
    scale: 0.2  # Scale variation
    mosaic: 0.0  # Disable mosaic for documents
    hsv_h: 0.0  # No hue variation
    hsv_s: 0.1  # Slight saturation variation
    hsv_v: 0.2  # Brightness variation

  # Class definitions
  # Maps each integer class index (0-6) to its label name.
  classes:
    0: invoice_number
    1: invoice_date
    2: invoice_due_date
    3: ocr_number
    4: bankgiro
    5: plusgiro
    6: amount

# Auto-labeling
# Thresholds applied when turning matches into training annotations.
autolabel:
  min_confidence: 0.7  # Minimum score to include in training
  bbox_padding: 0.02  # Padding around bboxes (fraction of image)

# Dataset Split
# The three ratios below sum to 1.0 (0.8 + 0.1 + 0.1).
dataset:
  train_ratio: 0.8
  val_ratio: 0.1
  test_ratio: 0.1
  random_seed: 42

# Inference
# Detection thresholds plus the regex-fallback switch and its trigger.
inference:
  confidence_threshold: 0.5  # Detection confidence threshold
  iou_threshold: 0.45  # NMS IOU threshold
  enable_fallback: true  # Enable regex fallback if YOLO fails
  fallback_min_missing: 2  # Min missing fields to trigger fallback

# Paths (relative to project root)
paths:
  raw_pdfs: data/raw_pdfs
  images: data/images
  labels: data/labels
  structured_data: data/structured_data
  models: models
  reports: reports