---
# Invoice Master POC v2 Configuration
# Default configuration for the invoice field extraction system.
#
# NOTE(review): this file was restored from a copy whose line breaks and
# indentation had been lost. Nesting of `context_keywords` (under `matching`)
# and of `augmentation` / `classes` (under `yolo`) was inferred from the
# section comments -- confirm against the code that loads this file.

# PDF Processing
pdf:
  dpi: 300  # Resolution for rendering PDFs to images
  min_text_chars: 30  # Minimum chars to consider PDF as text-based

# OCR Settings
ocr:
  engine: paddleocr  # OCR engine to use
  lang: en  # Language code (en, sv, ch, etc.)
  use_gpu: false  # Enable GPU acceleration

# Field Normalization
normalize:
  # Bankgiro formats
  bankgiro:
    format: "XXXX-XXXX"  # Standard 8-digit format
    alternatives:
      - "XXX-XXXX"  # 7-digit format

  # Plusgiro formats
  plusgiro:
    format: "XXXXXXX-X"  # Standard format with check digit

  # Amount formats
  amount:
    decimal_separator: ","  # Swedish uses comma
    thousand_separator: " "  # Space for thousands
    currency_symbols:
      - "SEK"
      - "kr"

  # Date formats
  date:
    output_format: "%Y-%m-%d"
    input_formats:
      - "%Y-%m-%d"
      - "%Y-%m-%d %H:%M:%S"
      - "%d/%m/%Y"
      - "%d.%m.%Y"

# Field Matching
matching:
  min_score_threshold: 0.7  # Minimum score to accept match
  context_radius: 100  # Pixels to search for context keywords

  # Context keywords for each field (Swedish)
  context_keywords:
    InvoiceNumber:
      - "fakturanr"
      - "fakturanummer"
      - "invoice"
    InvoiceDate:
      - "fakturadatum"
      - "datum"
    InvoiceDueDate:
      - "förfallodatum"
      - "förfaller"
      - "betalas senast"
    OCR:
      - "ocr"
      - "referens"
    Bankgiro:
      - "bankgiro"
      - "bg"
    Plusgiro:
      - "plusgiro"
      - "pg"
    Amount:
      - "att betala"
      - "summa"
      - "total"
      - "belopp"

# YOLO Training
yolo:
  model: yolov8s  # Model architecture (yolov8n/s/m/l/x)
  epochs: 100
  batch_size: 16
  img_size: 1280  # Image size for training

  # Data augmentation
  augmentation:
    rotation: 5  # Max rotation degrees
    scale: 0.2  # Scale variation
    mosaic: 0.0  # Disable mosaic for documents
    hsv_h: 0.0  # No hue variation
    hsv_s: 0.1  # Slight saturation variation
    hsv_v: 0.2  # Brightness variation

  # Class definitions (class index -> label name)
  classes:
    0: invoice_number
    1: invoice_date
    2: invoice_due_date
    3: ocr_number
    4: bankgiro
    5: plusgiro
    6: amount

# Auto-labeling
autolabel:
  min_confidence: 0.7  # Minimum score to include in training
  bbox_padding: 0.02  # Padding around bboxes (fraction of image)

# Dataset Split
dataset:
  train_ratio: 0.8
  val_ratio: 0.1
  test_ratio: 0.1
  random_seed: 42

# Inference
inference:
  confidence_threshold: 0.5  # Detection confidence threshold
  iou_threshold: 0.45  # NMS IOU threshold
  enable_fallback: true  # Enable regex fallback if YOLO fails
  fallback_min_missing: 2  # Min missing fields to trigger fallback

# Paths (relative to project root)
paths:
  raw_pdfs: data/raw_pdfs
  images: data/images
  labels: data/labels
  structured_data: data/structured_data
  models: models
  reports: reports