Files
invoice-master-poc-v2/configs/default.yaml
Yaojia Wang ad5ed46b4c WIP
2026-02-11 23:40:38 +01:00

130 lines
3.1 KiB
YAML

# Invoice Master POC v2 Configuration
# Default configuration for invoice field extraction system
# PDF Processing
# Settings used when rendering PDFs and deciding whether a PDF is text-based.
pdf:
  dpi: 300  # Resolution for rendering PDFs to images
  min_text_chars: 30  # Minimum chars to consider PDF as text-based
# OCR Settings
ocr:
  engine: paddleocr  # OCR engine to use
  # NOTE(review): the context keywords in this file are Swedish
  # (e.g. "förfallodatum"); confirm that `en` is the intended OCR language.
  lang: en  # Language code (en, sv, ch, etc.)
  use_gpu: false  # Enable GPU acceleration
# Field Normalization
# One sub-mapping per field type. Each payment-number type carries its own
# `format` key, so the nesting is required to avoid duplicate `format` keys.
normalize:
  # Bankgiro formats
  bankgiro:
    format: "XXXX-XXXX"  # Standard 8-digit format
    alternatives:
      - "XXX-XXXX"  # 7-digit format

  # Plusgiro formats
  plusgiro:
    format: "XXXXXXX-X"  # Standard format with check digit

  # Amount formats
  amount:
    decimal_separator: ","  # Swedish uses comma
    thousand_separator: " "  # Space for thousands
    currency_symbols:
      - "SEK"
      - "kr"

  # Date formats (strftime-style patterns; quoted because they start with `%`)
  date:
    output_format: "%Y-%m-%d"
    input_formats:
      - "%Y-%m-%d"
      - "%Y-%m-%d %H:%M:%S"
      - "%d/%m/%Y"
      - "%d.%m.%Y"
# Field Matching
matching:
  min_score_threshold: 0.7  # Minimum score to accept match
  context_radius: 100  # Pixels to search for context keywords

  # Context keywords for each field (Swedish)
  # NOTE(review): nesting under `matching` is reconstructed from the section
  # layout; confirm against the code that reads this file.
  context_keywords:
    InvoiceNumber:
      - "fakturanr"
      - "fakturanummer"
      - "invoice"
    InvoiceDate:
      - "fakturadatum"
      - "datum"
    InvoiceDueDate:
      - "förfallodatum"
      - "förfaller"
      - "betalas senast"
    OCR:
      - "ocr"
      - "referens"
    Bankgiro:
      - "bankgiro"
      - "bg"
    Plusgiro:
      - "plusgiro"
      - "pg"
    Amount:
      - "att betala"
      - "summa"
      - "total"
      - "belopp"
# YOLO Training
yolo:
  model: yolo26s  # Model architecture (yolo26n/s/m/l/x)
  epochs: 100
  batch_size: 16
  img_size: 1280  # Image size for training

  # Data augmentation
  augmentation:
    rotation: 5  # Max rotation degrees
    scale: 0.2  # Scale variation
    mosaic: 0.0  # Disable mosaic for documents
    hsv_h: 0.0  # No hue variation
    hsv_s: 0.1  # Slight saturation variation
    hsv_v: 0.2  # Brightness variation

  # Class definitions (integer class id -> label name)
  # NOTE(review): placement under `yolo` is reconstructed; confirm whether the
  # consumer expects `classes` here or as a top-level key.
  classes:
    0: invoice_number
    1: invoice_date
    2: invoice_due_date
    3: ocr_number
    4: bankgiro
    5: plusgiro
    6: amount
# Auto-labeling
autolabel:
  min_confidence: 0.7  # Minimum score to include in training
  bbox_padding: 0.02  # Padding around bboxes (fraction of image)
# Dataset Split
# The three ratios below sum to 1.0 (0.8 + 0.1 + 0.1).
dataset:
  train_ratio: 0.8
  val_ratio: 0.1
  test_ratio: 0.1
  random_seed: 42
# Inference
inference:
  confidence_threshold: 0.5  # Detection confidence threshold
  iou_threshold: 0.45  # NMS IOU threshold
  enable_fallback: true  # Enable regex fallback if YOLO fails
  fallback_min_missing: 2  # Min missing fields to trigger fallback
# Paths (relative to project root)
paths:
  raw_pdfs: data/raw_pdfs
  images: data/images
  labels: data/labels
  structured_data: data/structured_data
  models: models
  reports: reports