Initial commit: Invoice field extraction system using YOLO + OCR

Features: - Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations - Flexible date matching: year-month match, nearby date tolerance - PDF text extraction with PyMuPDF - OCR support for scanned documents (PaddleOCR) - YOLO training and inference pipeline - 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions
--- a/src/normalize/normalizer.py
+++ b/src/normalize/normalizer.py
@@ -0,0 +1,290 @@
+"""
+Field Normalization Module
+
+Normalizes field values to generate multiple candidate forms for matching.
+"""
+
+import re
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Callable
+
+
+@dataclass
+class NormalizedValue:
+    """Represents a normalized value with its variants.
+
+    Plain data holder; nothing in this module constructs it, so how callers
+    populate it is not visible here.
+    """
+    # The value before normalization.
+    original: str
+    # Candidate strings derived from `original`.
+    variants: list[str]
+    # Name of the field the value belongs to; presumably one of the
+    # NORMALIZERS keys such as 'Amount' -- not enforced here, confirm with callers.
+    field_type: str
+
+
+class FieldNormalizer:
+    """Turn raw invoice field values into candidate strings for matching.
+
+    Every public ``normalize_*`` method is static, takes the raw text and
+    returns a list of distinct, non-empty variants. The cleaned input comes
+    first and derived forms follow in a fixed order, so results are
+    reproducible from run to run.
+    """
+
+    # Common Swedish month names for date parsing
+    SWEDISH_MONTHS = {
+        'januari': '01', 'jan': '01',
+        'februari': '02', 'feb': '02',
+        'mars': '03', 'mar': '03',
+        'april': '04', 'apr': '04',
+        'maj': '05',
+        'juni': '06', 'jun': '06',
+        'juli': '07', 'jul': '07',
+        'augusti': '08', 'aug': '08',
+        'september': '09', 'sep': '09', 'sept': '09',
+        'oktober': '10', 'okt': '10',
+        'november': '11', 'nov': '11',
+        'december': '12', 'dec': '12'
+    }
+
+    # Zero-width characters; str.split() does not treat them as whitespace.
+    _ZERO_WIDTH_RE = re.compile(r'[\u200b\u200c\u200d\ufeff]')
+    _NON_DIGIT_RE = re.compile(r'\D')
+    # Trailing currency markers: "SEK", "kr", ":-", ",-" or a bare "-",
+    # possibly repeated and separated by spaces. This must be an alternation:
+    # a character class would strip arbitrary trailing letters instead.
+    _AMOUNT_SUFFIX_RE = re.compile(r'(?:\s*(?:SEK|kr|[:,]?-))+$', re.IGNORECASE)
+    # Optionally signed number whose only separator is a '.' decimal mark.
+    _PLAIN_NUMBER_RE = re.compile(r'[+-]?\d+(?:\.\d*)?')
+    # Numeric date layouts, tried in order. Named groups y/m/d carry the parts;
+    # a two-character "y" group is a two-digit year.
+    _DATE_PATTERNS = (
+        # ISO date, optionally followed by a time (e.g. 2026-01-09 00:00:00)
+        re.compile(
+            r'(?P<y>\d{4})-(?P<m>\d{1,2})-(?P<d>\d{1,2})'
+            r'(?:[T ]\d{1,2}:\d{2}(?::\d{2}(?:\.\d+)?)?)?'
+        ),
+        # Year first with / or . (e.g. 2025/12/13)
+        re.compile(r'(?P<y>\d{4})(?P<sep>[/.])(?P<m>\d{1,2})(?P=sep)(?P<d>\d{1,2})'),
+        # European day-first with /, . or - (e.g. 13.12.2025)
+        re.compile(r'(?P<d>\d{1,2})(?P<sep>[/.-])(?P<m>\d{1,2})(?P=sep)(?P<y>\d{4})'),
+        # Swedish compact format: YYMMDD
+        re.compile(r'(?P<y>\d{2})(?P<m>\d{2})(?P<d>\d{2})'),
+        # Swedish compact format: YYYYMMDD
+        re.compile(r'(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})'),
+    )
+    # Output layouts generated for every successfully parsed date.
+    _DATE_OUTPUT_FORMATS = ('%Y-%m-%d', '%d/%m/%Y', '%d.%m.%Y', '%Y%m%d')
+
+    @staticmethod
+    def _unique(candidates) -> list[str]:
+        """Drop empty strings and duplicates while keeping first-seen order."""
+        return list(dict.fromkeys(c for c in candidates if c))
+
+    @staticmethod
+    def _expand_year(year: int) -> int:
+        """Map a two-digit year to a full year (00-49 -> 20xx, 50-99 -> 19xx)."""
+        return 2000 + year if year < 50 else 1900 + year
+
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """Remove zero-width characters and collapse whitespace runs to one space."""
+        text = FieldNormalizer._ZERO_WIDTH_RE.sub('', text)
+        # split() with no argument also discards leading/trailing whitespace.
+        return ' '.join(text.split())
+
+    @staticmethod
+    def normalize_invoice_number(value: str) -> list[str]:
+        """
+        Normalize invoice number.
+        Returns the cleaned value plus, when it differs, its digits-only form.
+
+        Examples:
+            '100017500321' -> ['100017500321']
+            'INV-100017500321' -> ['INV-100017500321', '100017500321']
+        """
+        value = FieldNormalizer.clean_text(value)
+        digits_only = FieldNormalizer._NON_DIGIT_RE.sub('', value)
+        return FieldNormalizer._unique([value, digits_only])
+
+    @staticmethod
+    def normalize_ocr_number(value: str) -> list[str]:
+        """
+        Normalize OCR number (Swedish payment reference).
+        Handled exactly like an invoice number.
+        """
+        return FieldNormalizer.normalize_invoice_number(value)
+
+    @staticmethod
+    def normalize_bankgiro(value: str) -> list[str]:
+        """
+        Normalize Bankgiro number.
+
+        The dashed form is only generated for 7 digits (XXX-XXXX) and
+        8 digits (XXXX-XXXX).
+
+        Examples:
+            '5393-9484' -> ['5393-9484', '53939484']
+            '53939484' -> ['53939484', '5393-9484']
+        """
+        value = FieldNormalizer.clean_text(value)
+        digits_only = FieldNormalizer._NON_DIGIT_RE.sub('', value)
+
+        variants = [value, digits_only]
+        if len(digits_only) in (7, 8):
+            # The last four digits always follow the dash.
+            variants.append(f"{digits_only[:-4]}-{digits_only[-4:]}")
+
+        return FieldNormalizer._unique(variants)
+
+    @staticmethod
+    def normalize_plusgiro(value: str) -> list[str]:
+        """
+        Normalize Plusgiro number.
+
+        The dashed form puts the final (check) digit after the dash and is
+        generated for 2 to 8 digits.
+
+        Examples:
+            '1234567-8' -> ['1234567-8', '12345678']
+            '12345678' -> ['12345678', '1234567-8']
+        """
+        value = FieldNormalizer.clean_text(value)
+        digits_only = FieldNormalizer._NON_DIGIT_RE.sub('', value)
+
+        variants = [value, digits_only]
+        if 2 <= len(digits_only) <= 8:
+            variants.append(f"{digits_only[:-1]}-{digits_only[-1]}")
+
+        return FieldNormalizer._unique(variants)
+
+    @staticmethod
+    def _to_dot_decimal(text: str) -> str:
+        """Return *text* as a plain dot-decimal number string, or '' if it is not one.
+
+        A single ',' or '.' is read as the decimal mark. When both occur, the
+        right-most one is the decimal mark and the other groups thousands.
+        Several separators of one kind are read as thousands grouping.
+        """
+        commas = text.count(',')
+        dots = text.count('.')
+        if commas and dots:
+            if text.rfind(',') > text.rfind('.'):
+                decimal_mark, grouping = ',', '.'
+            else:
+                decimal_mark, grouping = '.', ','
+            if text.count(decimal_mark) != 1:
+                return ''
+            text = text.replace(grouping, '').replace(decimal_mark, '.')
+        elif commas == 1:
+            text = text.replace(',', '.')
+        elif commas > 1:
+            text = text.replace(',', '')
+        elif dots > 1:
+            text = text.replace('.', '')
+
+        if FieldNormalizer._PLAIN_NUMBER_RE.fullmatch(text):
+            return text
+        return ''
+
+    @staticmethod
+    def normalize_amount(value: str) -> list[str]:
+        """
+        Normalize monetary amount.
+
+        Trailing currency markers (SEK, kr, ':-', ',-') are removed first.
+        Text that is not a number after that is returned as-is (plus its
+        space-free and separator-swapped forms); an input consisting only of
+        a currency marker yields an empty list.
+
+        Examples:
+            '114' -> ['114', '114,00', '114.00']
+            '114,00' -> ['114,00', '114.00', '114']
+            '1 234,56' -> ['1 234,56', '1234,56', '1234.56']
+            '1.234,56' -> ['1.234,56', '1234,56', '1234.56']
+        """
+        value = FieldNormalizer.clean_text(value)
+        value = FieldNormalizer._AMOUNT_SUFFIX_RE.sub('', value).strip()
+        if not value:
+            return []
+
+        # clean_text() has already turned every whitespace character,
+        # including non-breaking spaces, into ' ' (thousand separators).
+        no_space = value.replace(' ', '')
+        variants = [value, no_space]
+
+        canonical = FieldNormalizer._to_dot_decimal(no_space)
+        if not canonical:
+            # Not a recognizable number: only offer the other decimal mark.
+            if ',' in no_space:
+                variants.append(no_space.replace(',', '.'))
+            elif '.' in no_space:
+                variants.append(no_space.replace('.', ','))
+            return FieldNormalizer._unique(variants)
+
+        variants.append(canonical.replace('.', ','))
+        variants.append(canonical)
+
+        whole, _, fraction = canonical.partition('.')
+        if not fraction.strip('0'):
+            # Integral amount. int() on the digit string stays exact for any
+            # size and cannot overflow the way a float round trip can.
+            whole_number = str(int(whole))
+            variants.extend(
+                [whole_number, f"{whole_number},00", f"{whole_number}.00"]
+            )
+        else:
+            two_decimals = f"{float(canonical):.2f}"
+            variants.extend([two_decimals, two_decimals.replace('.', ',')])
+
+        return FieldNormalizer._unique(variants)
+
+    @staticmethod
+    def _parse_numeric_date(text: str):
+        """Parse an all-numeric date; return a datetime, or None if no layout fits."""
+        for pattern in FieldNormalizer._DATE_PATTERNS:
+            match = pattern.fullmatch(text)
+            if match is None:
+                continue
+            year = int(match['y'])
+            if len(match['y']) == 2:
+                year = FieldNormalizer._expand_year(year)
+            try:
+                return datetime(year, int(match['m']), int(match['d']))
+            except ValueError:
+                # Right shape but not a real calendar date; try the next layout.
+                continue
+        return None
+
+    @staticmethod
+    def _parse_month_name_date(text: str):
+        """Parse a date written with a month name; return a datetime or None.
+
+        The first number is the day and the last one the year, unless the
+        text starts with a four-digit number and ends with a one- or
+        two-digit one, in which case the roles are swapped.
+        """
+        numbers = re.findall(r'\d+', text)
+        if len(numbers) < 2:
+            return None
+
+        day_text, year_text = numbers[0], numbers[-1]
+        if len(day_text) == 4 and len(year_text) <= 2:
+            day_text, year_text = year_text, day_text
+
+        day = int(day_text)
+        year = int(year_text)
+        if year < 100:
+            year = FieldNormalizer._expand_year(year)
+
+        lowered = text.lower()
+        for month_name, month_num in FieldNormalizer.SWEDISH_MONTHS.items():
+            # Substring test on purpose: it also accepts inflected or foreign
+            # spellings that merely start with a known abbreviation.
+            if month_name not in lowered:
+                continue
+            try:
+                return datetime(year, int(month_num), day)
+            except (ValueError, OverflowError):
+                # OverflowError: a long reference number was taken for the year.
+                continue
+        return None
+
+    @staticmethod
+    def normalize_date(value: str) -> list[str]:
+        """
+        Normalize date and generate ISO, European and compact variants.
+
+        Recognized inputs are ISO dates (optionally with a time), year-first
+        and day-first numeric dates, compact YYMMDD / YYYYMMDD, and dates
+        with a month name. An unparseable value is returned unchanged as the
+        only variant.
+
+        Handles:
+            '2025-12-13' -> ['2025-12-13', '13/12/2025', '13.12.2025', '20251213']
+            '13/12/2025' -> ['13/12/2025', '2025-12-13', '13.12.2025', '20251213']
+            '13 december 2025' -> ['13 december 2025', '2025-12-13', ...]
+        """
+        value = FieldNormalizer.clean_text(value)
+        variants = [value]
+
+        parsed_date = FieldNormalizer._parse_numeric_date(value)
+        if parsed_date is None:
+            parsed_date = FieldNormalizer._parse_month_name_date(value)
+
+        if parsed_date is not None:
+            variants.extend(
+                parsed_date.strftime(layout)
+                for layout in FieldNormalizer._DATE_OUTPUT_FORMATS
+            )
+
+        return FieldNormalizer._unique(variants)
+
+
+# Field type to normalizer mapping.
+# Keys are the field names looked up by normalize_field(); each value takes
+# the raw string and returns its list of variants. Both date fields share
+# normalize_date. Field names missing from this table are only text-cleaned
+# by normalize_field().
+NORMALIZERS: dict[str, Callable[[str], list[str]]] = {
+    'InvoiceNumber': FieldNormalizer.normalize_invoice_number,
+    'OCR': FieldNormalizer.normalize_ocr_number,
+    'Bankgiro': FieldNormalizer.normalize_bankgiro,
+    'Plusgiro': FieldNormalizer.normalize_plusgiro,
+    'Amount': FieldNormalizer.normalize_amount,
+    'InvoiceDate': FieldNormalizer.normalize_date,
+    'InvoiceDueDate': FieldNormalizer.normalize_date,
+}
+
+
+def normalize_field(field_name: str, value: str) -> list[str]:
+    """
+    Normalize a field value based on its type.
+
+    Args:
+        field_name: Name of the field (e.g., 'InvoiceNumber', 'Amount').
+            Names without an entry in NORMALIZERS are only text-cleaned.
+        value: Raw value to normalize. Non-string values are converted with
+            str(); None and float NaN count as missing.
+
+    Returns:
+        List of normalized variants; empty when the value is missing or has
+        no visible characters.
+    """
+    if value is None:
+        return []
+    # NaN is the only float that differs from itself. It typically stands for
+    # an empty cell and must not be normalized as the text "nan".
+    if isinstance(value, float) and value != value:
+        return []
+
+    raw_text = str(value)
+    cleaned = FieldNormalizer.clean_text(raw_text)
+    if not cleaned:
+        # Covers whitespace-only input as well as text made solely of
+        # zero-width characters, which str.strip() does not remove.
+        return []
+
+    normalizer = NORMALIZERS.get(field_name)
+    if normalizer is None:
+        # Default: just clean the text
+        return [cleaned]
+
+    return normalizer(raw_text)