Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions

306
src/data/csv_loader.py Normal file
View File

@@ -0,0 +1,306 @@
"""
CSV Data Loader
Loads and parses structured invoice data from CSV files.
Follows the CSV specification for invoice data.
"""
import csv
from dataclasses import dataclass, field
from datetime import datetime, date
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import Any, Iterator
@dataclass
class InvoiceRow:
"""Parsed invoice data row."""
DocumentId: str
InvoiceDate: date | None = None
InvoiceNumber: str | None = None
InvoiceDueDate: date | None = None
OCR: str | None = None
Message: str | None = None
Bankgiro: str | None = None
Plusgiro: str | None = None
Amount: Decimal | None = None
# Raw values for reference
raw_data: dict = field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for matching."""
return {
'DocumentId': self.DocumentId,
'InvoiceDate': self.InvoiceDate.isoformat() if self.InvoiceDate else None,
'InvoiceNumber': self.InvoiceNumber,
'InvoiceDueDate': self.InvoiceDueDate.isoformat() if self.InvoiceDueDate else None,
'OCR': self.OCR,
'Bankgiro': self.Bankgiro,
'Plusgiro': self.Plusgiro,
'Amount': str(self.Amount) if self.Amount else None,
}
def get_field_value(self, field_name: str) -> str | None:
"""Get field value as string for matching."""
value = getattr(self, field_name, None)
if value is None:
return None
if isinstance(value, date):
return value.isoformat()
if isinstance(value, Decimal):
return str(value)
return str(value) if value else None
class CSVLoader:
    """Loads invoice data from CSV files and locates the matching PDFs."""

    # Expected field mappings (CSV header -> InvoiceRow attribute)
    FIELD_MAPPINGS = {
        'DocumentId': 'DocumentId',
        'InvoiceDate': 'InvoiceDate',
        'InvoiceNumber': 'InvoiceNumber',
        'InvoiceDueDate': 'InvoiceDueDate',
        'OCR': 'OCR',
        'Message': 'Message',
        'Bankgiro': 'Bankgiro',
        'Plusgiro': 'Plusgiro',
        'Amount': 'Amount',
    }

    # Date layouts tried, in this order, by _parse_date.
    _DATE_FORMATS = (
        '%Y-%m-%d',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d/%m/%Y',
        '%d.%m.%Y',
        '%d-%m-%Y',
        '%Y%m%d',
    )

    def __init__(
        self,
        csv_path: str | Path,
        pdf_dir: str | Path | None = None,
        doc_map_path: str | Path | None = None,
        encoding: str = 'utf-8'
    ):
        """
        Initialize CSV loader.

        Args:
            csv_path: Path to the CSV file
            pdf_dir: Directory containing PDF files. When omitted, the
                ``raw_pdfs`` directory located two levels above the CSV file
                is used (``csv_path.parent.parent / 'raw_pdfs'``).
            doc_map_path: Optional path to document mapping CSV
            encoding: CSV file encoding (default: utf-8)
        """
        self.csv_path = Path(csv_path)
        self.pdf_dir = Path(pdf_dir) if pdf_dir else self.csv_path.parent.parent / 'raw_pdfs'
        self.doc_map_path = Path(doc_map_path) if doc_map_path else None
        self.encoding = encoding
        # Load document mapping if provided
        self.doc_map = self._load_doc_map() if self.doc_map_path else {}

    def _detect_encoding(self, path: Path) -> str:
        """Return the first candidate encoding that decodes *path* completely.

        Candidates are ``utf-8-sig`` (so a BOM is stripped), the configured
        encoding and ``latin-1``, tried in that order. The whole file is
        decoded before an encoding is accepted, so callers never start
        emitting rows with an encoding that fails further down the file.

        Raises:
            ValueError: If no candidate decodes the file.
        """
        # dict.fromkeys drops duplicates while keeping the order.
        for candidate in dict.fromkeys(('utf-8-sig', self.encoding, 'latin-1')):
            try:
                with open(path, 'r', encoding=candidate, newline='') as handle:
                    for _ in handle:
                        pass
            except UnicodeDecodeError:
                continue
            return candidate
        # latin-1 maps every byte, so this is only a safeguard.
        raise ValueError("Could not read CSV file with any supported encoding")

    def _load_doc_map(self) -> dict[str, str]:
        """Load document ID to filename mapping.

        Reads the ``DocumentId`` and ``FileName`` columns of the mapping CSV.
        Rows where either value is missing or blank are skipped.

        Returns:
            Mapping of document ID to file name; empty when no mapping path
            is set or the file does not exist.
        """
        mapping: dict[str, str] = {}
        if not (self.doc_map_path and self.doc_map_path.exists()):
            return mapping
        detected = self._detect_encoding(self.doc_map_path)
        with open(self.doc_map_path, 'r', encoding=detected, newline='') as f:
            for row in csv.DictReader(f):
                # DictReader fills the missing cells of a short row with
                # None, so ``row.get(key, '')`` alone would not protect the
                # strip() call.
                doc_id = (row.get('DocumentId') or '').strip()
                filename = (row.get('FileName') or '').strip()
                if doc_id and filename:
                    mapping[doc_id] = filename
        return mapping

    def _parse_date(self, value: str | None) -> date | None:
        """Parse date from various formats.

        Returns:
            The parsed date, or ``None`` when the value is empty or matches
            none of the supported formats.
        """
        if not value or not value.strip():
            return None
        text = value.strip()
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt).date()
            except ValueError:
                continue
        return None

    def _parse_amount(self, value: str | None) -> Decimal | None:
        """Parse monetary amount from various formats.

        Handles the ``SEK`` / ``kr`` / ``:-`` markers, space-separated
        thousands, and both ``1.234,56`` and ``1,234.56`` style separators.

        Returns:
            The amount as a ``Decimal``, or ``None`` when the value is empty,
            not a number, or not finite (NaN / Infinity).
        """
        if not value or not value.strip():
            return None
        text = value.strip()
        # Remove currency symbols and common suffixes
        for marker in ('SEK', 'kr', ':-'):
            text = text.replace(marker, '')
        text = text.strip()
        # Remove spaces (thousand separators), including no-break variants
        for blank in (' ', '\xa0', '\u202f'):
            text = text.replace(blank, '')

        last_comma = text.rfind(',')
        last_dot = text.rfind('.')
        if last_comma != -1 and last_dot != -1:
            # Both separators present: the one that appears last is the
            # decimal mark, the other one groups thousands.
            if last_comma > last_dot:
                text = text.replace('.', '').replace(',', '.')
            else:
                text = text.replace(',', '')
        elif last_comma != -1:
            # A single comma is a decimal mark (European format); repeated
            # commas can only be thousands separators.
            if text.count(',') > 1:
                text = text.replace(',', '')
            else:
                text = text.replace(',', '.')
        elif text.count('.') > 1:
            # Repeated dots can only be thousands separators.
            text = text.replace('.', '')

        try:
            amount = Decimal(text)
        except InvalidOperation:
            return None
        # Decimal() accepts "NaN" and "Infinity", which are not amounts.
        return amount if amount.is_finite() else None

    def _parse_string(self, value: str | None) -> str | None:
        """Parse string field with cleanup.

        Returns:
            The stripped text, or ``None`` when it is missing or blank.
        """
        if value is None:
            return None
        cleaned = value.strip()
        return cleaned or None

    def _parse_row(self, row: dict) -> InvoiceRow | None:
        """Parse a single CSV row into InvoiceRow.

        Returns:
            The parsed row, or ``None`` when the row has no DocumentId.
        """
        doc_id = self._parse_string(row.get('DocumentId'))
        if not doc_id:
            return None
        return InvoiceRow(
            DocumentId=doc_id,
            InvoiceDate=self._parse_date(row.get('InvoiceDate')),
            InvoiceNumber=self._parse_string(row.get('InvoiceNumber')),
            InvoiceDueDate=self._parse_date(row.get('InvoiceDueDate')),
            OCR=self._parse_string(row.get('OCR')),
            Message=self._parse_string(row.get('Message')),
            Bankgiro=self._parse_string(row.get('Bankgiro')),
            Plusgiro=self._parse_string(row.get('Plusgiro')),
            Amount=self._parse_amount(row.get('Amount')),
            raw_data=dict(row)
        )

    def load_all(self) -> list[InvoiceRow]:
        """Load all rows from CSV."""
        return list(self.iter_rows())

    def _iter_raw_rows(self) -> Iterator[tuple[int, dict]]:
        """Yield ``(row_number, raw_row)`` for every data row of the CSV.

        Numbering starts at 2 because the header is row 1. Rows that
        ``csv.DictReader`` skips as completely empty are not counted.
        """
        detected = self._detect_encoding(self.csv_path)
        with open(self.csv_path, 'r', encoding=detected, newline='') as f:
            yield from enumerate(csv.DictReader(f), start=2)

    def iter_rows(self) -> Iterator[InvoiceRow]:
        """Iterate over CSV rows.

        Rows without a DocumentId are skipped. The encoding is settled
        before the first row is produced, so a decoding failure late in the
        file cannot cause rows to be yielded twice.

        Raises:
            ValueError: If the file cannot be decoded with any supported
                encoding.
        """
        for _, raw in self._iter_raw_rows():
            parsed = self._parse_row(raw)
            if parsed is not None:
                yield parsed

    def get_pdf_path(self, invoice_row: InvoiceRow) -> Path | None:
        """
        Get PDF path for an invoice row.

        Uses document mapping if available, otherwise assumes
        DocumentId.pdf naming convention, and finally falls back to any
        ``*DocumentId*.pdf`` file in the PDF directory.

        Returns:
            Path of the first existing candidate, or ``None`` if no file is
            found.
        """
        doc_id = invoice_row.DocumentId

        # Check document mapping first, then the default naming patterns
        candidates = []
        mapped = self.doc_map.get(doc_id)
        if mapped:
            candidates.append(mapped)
        candidates.extend((
            f"{doc_id}.pdf",
            f"{doc_id.lower()}.pdf",
            f"{doc_id.upper()}.pdf",
        ))
        for name in candidates:
            pdf_path = self.pdf_dir / name
            if pdf_path.exists():
                return pdf_path

        # Try glob patterns for partial matches. Glob metacharacters in the
        # id are wrapped in a character class so they match literally, and
        # the smallest path is taken so the result does not depend on
        # directory listing order.
        literal_id = ''.join(f'[{ch}]' if ch in '*?[' else ch for ch in doc_id)
        return min(self.pdf_dir.glob(f"*{literal_id}*.pdf"), default=None)

    def get_row_by_id(self, doc_id: str) -> InvoiceRow | None:
        """Get a specific row by DocumentId.

        Returns:
            The first row whose DocumentId equals *doc_id*, or ``None``.
        """
        return next(
            (row for row in self.iter_rows() if row.DocumentId == doc_id),
            None,
        )

    def validate(self) -> list[dict]:
        """
        Validate CSV data and return issues.

        Every data row is inspected, including rows that ``iter_rows`` would
        skip, so the reported ``row`` numbers refer to positions in the file
        (header is row 1).

        Returns:
            List of validation issues
        """
        issues = []
        for row_number, raw in self._iter_raw_rows():
            row = self._parse_row(raw)
            # Check required DocumentId
            if row is None:
                issues.append({
                    'row': row_number,
                    'field': 'DocumentId',
                    'issue': 'Missing required DocumentId'
                })
                continue

            # Check if PDF exists
            if not self.get_pdf_path(row):
                issues.append({
                    'row': row_number,
                    'doc_id': row.DocumentId,
                    'field': 'PDF',
                    'issue': 'PDF file not found'
                })

            # Check for at least one matchable field. Compared against None
            # because a zero Decimal amount is falsy.
            matchable_fields = (
                row.InvoiceNumber,
                row.OCR,
                row.Bankgiro,
                row.Plusgiro,
                row.Amount,
            )
            if all(value is None for value in matchable_fields):
                issues.append({
                    'row': row_number,
                    'doc_id': row.DocumentId,
                    'field': 'All',
                    'issue': 'No matchable fields (InvoiceNumber/OCR/Bankgiro/Plusgiro/Amount)'
                })
        return issues
def load_invoice_csv(csv_path: str | Path, pdf_dir: str | Path | None = None) -> list[InvoiceRow]:
    """Load every invoice row from a CSV file in one call.

    Args:
        csv_path: Path to the CSV file.
        pdf_dir: Optional directory containing the PDF files; passed through
            to ``CSVLoader`` unchanged.

    Returns:
        All rows produced by ``CSVLoader.load_all()``.
    """
    return CSVLoader(csv_path, pdf_dir).load_all()