Initial commit: Invoice field extraction system using YOLO + OCR
Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
129
configs/default.yaml
Normal file
129
configs/default.yaml
Normal file
@@ -0,0 +1,129 @@
|
||||
# Invoice Master POC v2 Configuration
|
||||
# Default configuration for invoice field extraction system
|
||||
|
||||
# PDF Processing
pdf:
  # Resolution for rendering PDFs to images.
  dpi: 300
  # Minimum character count for a PDF to be considered text-based.
  min_text_chars: 30
|
||||
|
||||
# OCR Settings
ocr:
  # OCR engine to use.
  engine: paddleocr
  # Language code (en, sv, ch, etc.). Quote any code that a YAML 1.1 loader
  # would read as a boolean, e.g. "no".
  # NOTE(review): the context keywords in this file are Swedish and contain
  # characters such as "ö" -- confirm that `en` recognizes them as intended.
  lang: en
  # Set to true to enable GPU acceleration.
  use_gpu: false
|
||||
|
||||
# Field Normalization
normalize:
  # Bankgiro formats
  bankgiro:
    # Standard 8-digit format.
    format: "XXXX-XXXX"
    alternatives:
      # 7-digit format.
      - "XXX-XXXX"

  # Plusgiro formats
  plusgiro:
    # Standard format with check digit.
    format: "XXXXXXX-X"

  # Amount formats
  amount:
    # Swedish uses a comma as the decimal separator.
    decimal_separator: ","
    # A space separates thousands.
    thousand_separator: " "
    currency_symbols:
      - "SEK"
      - "kr"

  # Date formats
  date:
    output_format: "%Y-%m-%d"
    input_formats:
      - "%Y-%m-%d"
      - "%Y-%m-%d %H:%M:%S"
      - "%d/%m/%Y"
      - "%d.%m.%Y"
|
||||
|
||||
# Field Matching
matching:
  # Minimum score to accept a match.
  min_score_threshold: 0.7
  # Radius, in pixels, searched for context keywords.
  context_radius: 100

  # Context keywords for each field (Swedish)
  context_keywords:
    InvoiceNumber:
      - "fakturanr"
      - "fakturanummer"
      - "invoice"
    InvoiceDate:
      - "fakturadatum"
      - "datum"
    InvoiceDueDate:
      - "förfallodatum"
      - "förfaller"
      - "betalas senast"
    OCR:
      - "ocr"
      - "referens"
    Bankgiro:
      - "bankgiro"
      - "bg"
    Plusgiro:
      - "plusgiro"
      - "pg"
    Amount:
      - "att betala"
      - "summa"
      - "total"
      - "belopp"
|
||||
|
||||
# YOLO Training
yolo:
  # Model architecture (yolov8n/s/m/l/x).
  model: yolov8s
  epochs: 100
  batch_size: 16
  # Image size for training.
  img_size: 1280

  # Data augmentation
  augmentation:
    # Maximum rotation, in degrees.
    rotation: 5
    # Scale variation.
    scale: 0.2
    # Mosaic is disabled for documents.
    mosaic: 0.0
    # No hue variation.
    hsv_h: 0.0
    # Slight saturation variation.
    hsv_s: 0.1
    # Brightness variation.
    hsv_v: 0.2

  # Class definitions
  classes:
    0: invoice_number
    1: invoice_date
    2: invoice_due_date
    3: ocr_number
    4: bankgiro
    5: plusgiro
    6: amount
|
||||
|
||||
# Auto-labeling
autolabel:
  # Minimum score for a label to be included in training.
  min_confidence: 0.7
  # Padding added around bboxes, as a fraction of the image.
  bbox_padding: 0.02
|
||||
|
||||
# Dataset Split
dataset:
  # The three ratios below add up to 1.0.
  train_ratio: 0.8
  val_ratio: 0.1
  test_ratio: 0.1
  random_seed: 42
|
||||
|
||||
# Inference
inference:
  # Detection confidence threshold.
  confidence_threshold: 0.5
  # IOU threshold used for NMS.
  iou_threshold: 0.45
  # Enable the regex fallback if YOLO fails.
  enable_fallback: true
  # Minimum number of missing fields needed to trigger the fallback.
  fallback_min_missing: 2
|
||||
|
||||
# Paths (relative to project root)
paths:
  raw_pdfs: data/raw_pdfs
  images: data/images
  labels: data/labels
  structured_data: data/structured_data
  models: models
  reports: reports
|
||||
Reference in New Issue
Block a user