Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions
--- a/src/data/csv_loader.py
+++ b/src/data/csv_loader.py
@@ -0,0 +1,306 @@
+"""
+CSV Data Loader
+
+Loads and parses structured invoice data from CSV files.
+Follows the CSV specification for invoice data.
+"""
+
+import csv
+from dataclasses import dataclass, field
+from datetime import datetime, date
+from decimal import Decimal, InvalidOperation
+from pathlib import Path
+from typing import Any, Iterator
+
+
+@dataclass
+class InvoiceRow:
+    """Parsed invoice data row.
+
+    Only ``DocumentId`` is required. Every other field defaults to ``None``,
+    which is also what the loader stores when a CSV cell is empty or cannot
+    be parsed.
+    """
+    DocumentId: str
+    InvoiceDate: date | None = None
+    InvoiceNumber: str | None = None
+    InvoiceDueDate: date | None = None
+    OCR: str | None = None
+    Message: str | None = None
+    Bankgiro: str | None = None
+    Plusgiro: str | None = None
+    Amount: Decimal | None = None
+
+    # Raw values for reference (the unparsed CSV cells, keyed by header)
+    raw_data: dict = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for matching.
+
+        Dates are rendered in ISO format and the amount as a decimal string.
+        ``Message`` and ``raw_data`` are not part of the result.
+
+        Returns:
+            Mapping of field name to string value, or ``None`` for unset fields.
+        """
+        return {
+            'DocumentId': self.DocumentId,
+            'InvoiceDate': self.InvoiceDate.isoformat() if self.InvoiceDate else None,
+            'InvoiceNumber': self.InvoiceNumber,
+            'InvoiceDueDate': self.InvoiceDueDate.isoformat() if self.InvoiceDueDate else None,
+            'OCR': self.OCR,
+            'Bankgiro': self.Bankgiro,
+            'Plusgiro': self.Plusgiro,
+            # Compare against None: Decimal('0') is falsy but is a real amount,
+            # and get_field_value('Amount') already reports it as '0'.
+            'Amount': str(self.Amount) if self.Amount is not None else None,
+        }
+
+    def get_field_value(self, field_name: str) -> str | None:
+        """Get field value as string for matching.
+
+        Args:
+            field_name: Name of the attribute to read.
+
+        Returns:
+            The value as a string (ISO format for dates), or ``None`` when the
+            attribute does not exist, is ``None``, or is otherwise empty.
+        """
+        value = getattr(self, field_name, None)
+        if value is None:
+            return None
+        if isinstance(value, date):
+            return value.isoformat()
+        if isinstance(value, Decimal):
+            # Handled before the truthiness check below so that a zero
+            # amount is still returned as '0'.
+            return str(value)
+        return str(value) if value else None
+
+
+class CSVLoader:
+    """Loads invoice data from CSV files.
+
+    Parses each CSV record into an ``InvoiceRow``, resolves the PDF file that
+    belongs to a record, and can report basic data-quality issues.
+    """
+
+    # Expected field mappings (CSV header -> InvoiceRow attribute)
+    FIELD_MAPPINGS = {
+        'DocumentId': 'DocumentId',
+        'InvoiceDate': 'InvoiceDate',
+        'InvoiceNumber': 'InvoiceNumber',
+        'InvoiceDueDate': 'InvoiceDueDate',
+        'OCR': 'OCR',
+        'Message': 'Message',
+        'Bankgiro': 'Bankgiro',
+        'Plusgiro': 'Plusgiro',
+        'Amount': 'Amount',
+    }
+
+    # Date layouts accepted by _parse_date, tried in this order.
+    _DATE_FORMATS = (
+        '%Y-%m-%d',
+        '%Y-%m-%d %H:%M:%S',
+        '%Y-%m-%d %H:%M:%S.%f',
+        '%d/%m/%Y',
+        '%d.%m.%Y',
+        '%d-%m-%Y',
+        '%Y%m%d',
+    )
+
+    def __init__(
+        self,
+        csv_path: str | Path,
+        pdf_dir: str | Path | None = None,
+        doc_map_path: str | Path | None = None,
+        encoding: str = 'utf-8'
+    ):
+        """
+        Initialize CSV loader.
+
+        Args:
+            csv_path: Path to the CSV file
+            pdf_dir: Directory containing PDF files (default: the 'raw_pdfs'
+                directory located beside the CSV file's parent directory,
+                i.e. ``csv_path.parent.parent / 'raw_pdfs'``)
+            doc_map_path: Optional path to document mapping CSV
+            encoding: CSV file encoding (default: utf-8)
+        """
+        self.csv_path = Path(csv_path)
+        self.pdf_dir = Path(pdf_dir) if pdf_dir else self.csv_path.parent.parent / 'raw_pdfs'
+        self.doc_map_path = Path(doc_map_path) if doc_map_path else None
+        self.encoding = encoding
+
+        # Load document mapping if provided
+        self.doc_map = self._load_doc_map() if self.doc_map_path else {}
+
+    def _load_doc_map(self) -> dict[str, str]:
+        """Load document ID to filename mapping.
+
+        Reads the ``DocumentId`` and ``FileName`` columns of the mapping CSV.
+        Rows where either value is missing or blank are ignored.
+
+        Returns:
+            Mapping of document ID to file name; empty when no mapping file
+            is configured or the file does not exist.
+        """
+        mapping: dict[str, str] = {}
+        if not (self.doc_map_path and self.doc_map_path.exists()):
+            return mapping
+
+        # 'utf-8-sig' reads plain UTF-8 as well, but also strips a leading
+        # BOM that would otherwise corrupt the first header name.
+        encoding = self.encoding
+        if encoding.lower().replace('_', '-') in ('utf-8', 'utf8'):
+            encoding = 'utf-8-sig'
+
+        with open(self.doc_map_path, 'r', encoding=encoding, newline='') as f:
+            for row in csv.DictReader(f):
+                # DictReader fills cells missing from short rows with None,
+                # so fall back to '' before stripping.
+                doc_id = (row.get('DocumentId') or '').strip()
+                filename = (row.get('FileName') or '').strip()
+                if doc_id and filename:
+                    mapping[doc_id] = filename
+        return mapping
+
+    def _parse_date(self, value: str | None) -> date | None:
+        """Parse date from various formats.
+
+        Returns:
+            The parsed date, or ``None`` when the value is empty or matches
+            none of the supported formats.
+        """
+        if not value or not value.strip():
+            return None
+
+        value = value.strip()
+
+        for fmt in self._DATE_FORMATS:
+            try:
+                return datetime.strptime(value, fmt).date()
+            except ValueError:
+                continue
+
+        return None
+
+    def _parse_amount(self, value: str | None) -> Decimal | None:
+        """Parse monetary amount from various formats.
+
+        Accepts both ``1,234.56`` and ``1.234,56`` style numbers, optional
+        spaces as thousands separators and the markers 'SEK', 'kr' and ':-'.
+
+        Returns:
+            The amount as a ``Decimal``, or ``None`` when the value is empty
+            or is not a finite number.
+        """
+        if not value or not value.strip():
+            return None
+
+        text = value.strip()
+
+        # Remove currency symbols and common suffixes
+        for marker in ('SEK', 'kr', ':-'):
+            text = text.replace(marker, '')
+        text = text.strip()
+
+        # Remove spaces (thousand separators)
+        text = text.replace(' ', '').replace('\xa0', '')
+
+        last_comma = text.rfind(',')
+        last_dot = text.rfind('.')
+        if last_comma != -1 and last_dot != -1:
+            # Both separators present: the one that appears last is the
+            # decimal separator, the other one groups thousands.
+            if last_comma > last_dot:
+                text = text.replace('.', '').replace(',', '.')
+            else:
+                text = text.replace(',', '')
+        elif last_comma != -1:
+            if text.count(',') > 1:
+                # A number has at most one decimal separator, so repeated
+                # commas can only be thousands separators.
+                text = text.replace(',', '')
+            else:
+                # Single comma: decimal separator (European format)
+                text = text.replace(',', '.')
+        elif text.count('.') > 1:
+            # Repeated dots can only be thousands separators.
+            text = text.replace('.', '')
+
+        try:
+            amount = Decimal(text)
+        except InvalidOperation:
+            return None
+
+        # Decimal also accepts 'NaN' and 'Infinity'; those are not amounts.
+        return amount if amount.is_finite() else None
+
+    def _parse_string(self, value: str | None) -> str | None:
+        """Parse string field with cleanup.
+
+        Returns:
+            The stripped value, or ``None`` when it is missing or blank.
+        """
+        if value is None:
+            return None
+        value = value.strip()
+        return value if value else None
+
+    def _parse_row(self, row: dict) -> InvoiceRow | None:
+        """Parse a single CSV row into InvoiceRow.
+
+        Returns:
+            The parsed row, or ``None`` when the row has no DocumentId.
+        """
+        doc_id = self._parse_string(row.get('DocumentId'))
+        if not doc_id:
+            return None
+
+        return InvoiceRow(
+            DocumentId=doc_id,
+            InvoiceDate=self._parse_date(row.get('InvoiceDate')),
+            InvoiceNumber=self._parse_string(row.get('InvoiceNumber')),
+            InvoiceDueDate=self._parse_date(row.get('InvoiceDueDate')),
+            OCR=self._parse_string(row.get('OCR')),
+            Message=self._parse_string(row.get('Message')),
+            Bankgiro=self._parse_string(row.get('Bankgiro')),
+            Plusgiro=self._parse_string(row.get('Plusgiro')),
+            Amount=self._parse_amount(row.get('Amount')),
+            raw_data=dict(row)
+        )
+
+    def _detect_encoding(self) -> str:
+        """Return the first candidate encoding that decodes the whole CSV.
+
+        The file is decoded completely before any row is parsed, so a
+        decoding error late in the file cannot cause rows that were already
+        produced to be produced a second time under a fallback encoding.
+
+        Raises:
+            ValueError: If no candidate encoding can decode the file.
+        """
+        # utf-8-sig first so a BOM never ends up in the first header name.
+        # dict.fromkeys drops duplicates while keeping the order.
+        candidates = dict.fromkeys(['utf-8-sig', self.encoding, 'latin-1'])
+
+        for enc in candidates:
+            try:
+                with open(self.csv_path, 'r', encoding=enc, newline='') as f:
+                    while f.read(65536):
+                        pass
+            except UnicodeDecodeError:
+                continue
+            return enc
+
+        raise ValueError("Could not read CSV file with any supported encoding")
+
+    def _iter_raw_rows(self) -> Iterator[tuple[int, dict]]:
+        """Yield ``(row_number, raw_row)`` for every record in the CSV.
+
+        Row numbers count records, with the header as row 1 and the first
+        record as row 2. Completely blank lines are skipped by
+        ``csv.DictReader`` and are therefore not counted.
+        """
+        enc = self._detect_encoding()
+        # newline='' lets the csv module handle line endings itself.
+        with open(self.csv_path, 'r', encoding=enc, newline='') as f:
+            yield from enumerate(csv.DictReader(f), start=2)
+
+    def load_all(self) -> list[InvoiceRow]:
+        """Load all rows from CSV."""
+        return list(self.iter_rows())
+
+    def iter_rows(self) -> Iterator[InvoiceRow]:
+        """Iterate over CSV rows.
+
+        Rows without a DocumentId are skipped.
+
+        Raises:
+            ValueError: If the file cannot be decoded with any supported
+                encoding.
+        """
+        for _, raw in self._iter_raw_rows():
+            parsed = self._parse_row(raw)
+            if parsed is not None:
+                yield parsed
+
+    def get_pdf_path(self, invoice_row: InvoiceRow) -> Path | None:
+        """
+        Get PDF path for an invoice row.
+
+        Uses document mapping if available, otherwise assumes
+        DocumentId.pdf naming convention, and finally falls back to any PDF
+        whose name contains the DocumentId.
+
+        Returns:
+            Path of the first existing candidate, or ``None`` if none exists.
+        """
+        # Local import: only needed here, to escape glob metacharacters.
+        import glob
+
+        doc_id = invoice_row.DocumentId
+
+        # Check document mapping first
+        mapped_name = self.doc_map.get(doc_id)
+        if mapped_name:
+            mapped_path = self.pdf_dir / mapped_name
+            if mapped_path.exists():
+                return mapped_path
+
+        # Try default naming patterns
+        for name in (f"{doc_id}.pdf", f"{doc_id.lower()}.pdf", f"{doc_id.upper()}.pdf"):
+            candidate = self.pdf_dir / name
+            if candidate.exists():
+                return candidate
+
+        # Try glob patterns for partial matches. The id is escaped so that
+        # characters such as '[' or '*' are matched literally, and the
+        # matches are sorted so the chosen file does not depend on
+        # directory listing order.
+        matches = sorted(self.pdf_dir.glob(f"*{glob.escape(doc_id)}*.pdf"))
+        return matches[0] if matches else None
+
+    def get_row_by_id(self, doc_id: str) -> InvoiceRow | None:
+        """Get a specific row by DocumentId.
+
+        Returns:
+            The first row with that DocumentId, or ``None`` if there is none.
+        """
+        return next(
+            (row for row in self.iter_rows() if row.DocumentId == doc_id),
+            None,
+        )
+
+    def validate(self) -> list[dict]:
+        """
+        Validate CSV data and return issues.
+
+        Every record of the file is checked, including records without a
+        DocumentId, so the reported row numbers refer to positions in the
+        file (header is row 1).
+
+        Returns:
+            List of validation issues
+        """
+        issues = []
+
+        for row_num, raw in self._iter_raw_rows():
+            row = self._parse_row(raw)
+
+            # Check required DocumentId
+            if row is None:
+                issues.append({
+                    'row': row_num,
+                    'field': 'DocumentId',
+                    'issue': 'Missing required DocumentId'
+                })
+                continue
+
+            # Check if PDF exists
+            if not self.get_pdf_path(row):
+                issues.append({
+                    'row': row_num,
+                    'doc_id': row.DocumentId,
+                    'field': 'PDF',
+                    'issue': 'PDF file not found'
+                })
+
+            # Check for at least one matchable field. Compared against None
+            # because an Amount of Decimal('0') is falsy but still a value.
+            matchable_fields = (
+                row.InvoiceNumber,
+                row.OCR,
+                row.Bankgiro,
+                row.Plusgiro,
+                row.Amount,
+            )
+            if all(value is None for value in matchable_fields):
+                issues.append({
+                    'row': row_num,
+                    'doc_id': row.DocumentId,
+                    'field': 'All',
+                    'issue': 'No matchable fields (InvoiceNumber/OCR/Bankgiro/Plusgiro/Amount)'
+                })
+
+        return issues
+
+
+def load_invoice_csv(csv_path: str | Path, pdf_dir: str | Path | None = None) -> list[InvoiceRow]:
+    """Load every invoice row from a CSV file in one call.
+
+    Builds a ``CSVLoader`` for ``csv_path`` (passing ``pdf_dir`` through
+    unchanged) and returns all of its parsed rows as a list.
+    """
+    return CSVLoader(csv_path, pdf_dir).load_all()