# invoice-master-poc-v2/packages/backend/backend/pipeline/normalizers/invoice_number.py

"""
Invoice Number Normalizer

Handles normalization and validation of invoice numbers.
"""

import re

from .base import BaseNormalizer, NormalizationResult


class InvoiceNumberNormalizer(BaseNormalizer):
    """
    Normalizes invoice numbers from Swedish invoices.

    Invoice numbers can be:
    - Pure digits: 12345678
    - Alphanumeric: A3861, INV-2024-001, F12345
    - With separators: 2024/001, 2024-001

    Strategy (first rule that produces a value wins):
    1. Alphanumeric patterns (letters + digits, digits + letters,
       letters + optional ``-``/``/`` + digits), result upper-cased.
    2. Year-prefixed numbers such as ``2024-001`` or ``2024/12345``.
    3. Stand-alone digit runs of 3-10 digits, skipping date-like and
       year-like values and preferring typical invoice-number lengths.
    4. Fallback: every digit in the text concatenated, capped at 15 digits,
       returned with a warning.
    """

    # Alphanumeric forms, tried in this order; matched case-insensitively.
    _ALPHA_PATTERNS = (
        re.compile(r"\b([A-Z]{1,3}\d{3,10})\b", re.IGNORECASE),  # A3861, INV12345
        re.compile(r"\b(\d{3,10}[A-Z]{1,3})\b", re.IGNORECASE),  # 12345A
        re.compile(r"\b([A-Z]{2,5}[-/]?\d{3,10})\b", re.IGNORECASE),  # INV-12345, FAK12345
    )
    # Year prefix followed by a separator and a serial: 2024-001, 2024/12345.
    _YEAR_PATTERN = re.compile(r"\b(20\d{2}[-/]\d{3,8})\b")
    # Stand-alone digit runs; the word boundaries mean runs longer than
    # 10 digits never match at all, so no separate length cap is needed.
    _DIGIT_RUN_PATTERN = re.compile(r"\b(\d{3,10})\b")
    _NON_DIGIT_PATTERN = re.compile(r"\D")

    # Upper bound on the number of digits returned by the fallback rule.
    _FALLBACK_MAX_DIGITS = 15

    @property
    def field_name(self) -> str:
        """Name of the field this normalizer handles."""
        return "InvoiceNumber"

    @staticmethod
    def _looks_like_date_or_year(seq: str) -> bool:
        """Return True for 8-digit (YYYYMMDD-like) or 4-digit (year-like) runs starting with "20"."""
        return len(seq) in (4, 8) and seq.startswith("20")

    @staticmethod
    def _length_score(seq: str) -> tuple[int, int]:
        """
        Rank a digit run by how plausible its length is for an invoice number.

        Runs of 4-8 digits outrank everything else and, among those, the
        length closest to 6 wins. Outside that range shorter runs rank higher.
        """
        length = len(seq)
        if 4 <= length <= 8:
            return (1, -abs(length - 6))
        return (0, -length)

    def normalize(self, text: str) -> NormalizationResult:
        """
        Extract an invoice number from ``text``.

        Args:
            text: Raw text expected to contain an invoice number.

        Returns:
            A successful result holding the extracted number, a successful
            result with the warning "Fallback extraction" when only the
            digit-concatenation fallback applied, or a failure result when
            the text is empty or contains fewer than three digits.
        """
        text = text.strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # Rule 1: alphanumeric invoice numbers. Each pattern is searched over
        # the whole text before the next one is tried, so pattern order takes
        # priority over position in the text.
        # NOTE(review): for "INV-2024-001" the third pattern stops at the word
        # boundary after "2024" and yields "INV-2024" — confirm whether the
        # trailing segment should be kept.
        for pattern in self._ALPHA_PATTERNS:
            match = pattern.search(text)
            if match:
                return NormalizationResult.success(match.group(1).upper())

        # Rule 2: year-prefixed number (2024-001, 2024/12345).
        match = self._YEAR_PATTERN.search(text)
        if match:
            return NormalizationResult.success(match.group(1))

        # Rule 3: stand-alone digit runs, minus date-like and year-like values.
        candidates = [
            seq
            for seq in self._DIGIT_RUN_PATTERN.findall(text)
            if not self._looks_like_date_or_year(seq)
        ]
        if candidates:
            # max() keeps the first of equally ranked runs, so ties resolve
            # to the earliest occurrence in the text.
            return NormalizationResult.success(
                max(candidates, key=self._length_score)
            )

        # Rule 4: fallback — concatenate every digit found in the text.
        digits = self._NON_DIGIT_PATTERN.sub("", text)
        if len(digits) >= 3:
            return NormalizationResult.success_with_warning(
                digits[: self._FALLBACK_MAX_DIGITS], "Fallback extraction"
            )

        return NormalizationResult.failure(
            f"Cannot extract invoice number from: {text[:50]}"
        )