invoice-master-poc-v2/packages/backend/backend/pipeline/normalizers/amount.py

"""
Amount Normalizer

Handles normalization and validation of monetary amounts.
"""

import re

from shared.utils.text_cleaner import TextCleaner
from shared.utils.validators import FieldValidators
from shared.utils.ocr_corrections import OCRCorrections

from .base import BaseNormalizer, NormalizationResult


class AmountNormalizer(BaseNormalizer):
    """
    Normalizes monetary amounts from Swedish invoices.

    Handles various Swedish amount formats:
    - With decimal: 1 234,56 kr
    - With SEK suffix: 1234.56 SEK
    - Payment line kronor/ore: 590 00 (space = decimal separator)
    - Multiple amounts (returns the last one, usually the total)

    Successful results carry the amount formatted with two decimals
    (e.g. "1234.56"). Only strictly positive amounts are accepted.
    """

    # Payment line kronor/ore pattern: "590 00" means 590.00 SEK
    # Only matches when no comma/dot is present (pure digit-space-2digit format)
    _KRONOR_ORE_PATTERN = re.compile(r'^(\d+)\s+(\d{2})$')

    # Separate alternatives for European and Anglo formats.
    # (?!\d) lookahead prevents partial matches (e.g. "1,23" in "1,234.56")
    # group(1) European: dot=thousand, comma=decimal (2.254,50 or 1 234,56)
    # group(2) Anglo: comma=thousand, dot=decimal (1,234.56 or 1234.56)
    _AMOUNT_PATTERN = re.compile(
        r"(\d[\d\s.]*,\d{2})(?!\d)\s*(?:kr|SEK)?"
        r"|"
        r"(\d[\d\s,]*\.\d{2})(?!\d)\s*(?:kr|SEK)?",
        re.IGNORECASE,
    )

    # Space-like characters used as thousand separators: ASCII space,
    # no-break space, thin space and narrow no-break space. The regexes above
    # accept any "\s", so all of these can appear inside a matched amount.
    # Tabs and line breaks are intentionally NOT listed: a match containing
    # them keeps failing float() and is rejected, as before.
    _GROUPING_SPACES = str.maketrans("", "", " \u00a0\u2009\u202f")

    @property
    def field_name(self) -> str:
        """Name of the field handled by this normalizer."""
        return "Amount"

    @classmethod
    def _try_kronor_ore(cls, text: str) -> NormalizationResult | None:
        """Try to parse as payment line kronor/ore format.

        Swedish payment lines separate kronor and ore with a space:
        "590 00" = 590.00 SEK, "15658 00" = 15658.00 SEK

        Only applies when text has no comma or dot (otherwise it's
        a normal amount format with explicit decimal separator).

        Returns NormalizationResult on success, None if the text does not
        match or the parsed amount is not positive.
        """
        if ',' in text or '.' in text:
            return None

        match = cls._KRONOR_ORE_PATTERN.match(text.strip())
        if not match:
            return None

        kronor, ore = match.groups()
        try:
            amount = float(f"{kronor}.{ore}")
        except ValueError:
            return None
        if amount > 0:
            return NormalizationResult.success(f"{amount:.2f}")
        return None

    @staticmethod
    def _parse_amount_str(match: str) -> float | None:
        """Convert matched amount string to float, detecting European vs Anglo format.

        European: 2.254,50 -> 2254.50 (dot=thousand, comma=decimal)
        Anglo: 1,234.56 -> 1234.56 (comma=thousand, dot=decimal)
        Swedish: 1 234,56 -> 1234.56 (space=thousand, comma=decimal)

        Space-like grouping characters (space, no-break space, thin space,
        narrow no-break space) are removed before conversion.

        Returns the parsed value, or None when the cleaned string is not a
        valid float literal.
        """
        compact = match.translate(AmountNormalizer._GROUPING_SPACES)
        last_comma = compact.rfind(',')
        last_dot = compact.rfind('.')

        if last_comma != -1 and last_dot != -1:
            if last_comma > last_dot:
                # European: 2.254,50
                compact = compact.replace(".", "").replace(",", ".")
            else:
                # Anglo: 1,234.56
                compact = compact.replace(",", "")
        elif last_comma != -1:
            # Comma only: it is the decimal separator (1234,56)
            compact = compact.replace(",", ".")

        try:
            return float(compact)
        except ValueError:
            return None

    def normalize(self, text: str) -> NormalizationResult:
        """Extract a monetary amount from ``text``.

        Strategies are tried in order and the first positive amount wins:
        1. Payment line kronor/ore format ("590 00").
        2. Explicit decimal amounts, scanned line by line; the last one
           found is returned (usually the total).
        3. Shared TextCleaner / FieldValidators helpers on the whole text.
        4. Any simple ``digits[,.]dd`` number (last occurrence).
        5. An integer following amount/belopp/summa/total.
        6. The last standalone number with at least three digits.

        Returns a failure result for empty text or when nothing parses.
        """
        text = text.strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # Early check: payment line kronor/ore format ("590 00" -> 590.00)
        kronor_ore_result = self._try_kronor_ore(text)
        if kronor_ore_result is not None:
            return kronor_ore_result

        # Collect all valid amounts, line by line, so that a number never
        # spans a line break.
        all_amounts: list[float] = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            for m in self._AMOUNT_PATTERN.finditer(line):
                token = m.group(1) or m.group(2)
                if not token:
                    continue
                amount = self._parse_amount_str(token)
                if amount is not None and amount > 0:
                    all_amounts.append(amount)

        # Return the last amount found (usually the total)
        if all_amounts:
            return NormalizationResult.success(f"{all_amounts[-1]:.2f}")

        # Fallback: try shared validator on cleaned text
        cleaned = TextCleaner.normalize_amount_text(text)
        amount = FieldValidators.parse_amount(cleaned)
        if amount is not None and amount > 0:
            return NormalizationResult.success(f"{amount:.2f}")

        # Try to find any decimal number
        simple_pattern = r"(\d+[,\.]\d{2})"
        matches = re.findall(simple_pattern, text)
        if matches:
            amount_str = matches[-1].replace(",", ".")
            try:
                amount = float(amount_str)
                if amount > 0:
                    return NormalizationResult.success(f"{amount:.2f}")
            except ValueError:
                pass

        # Last resort: try to find integer amount (no decimals)
        # Look for patterns like "Amount: 11699"
        int_pattern = r"(?:amount|belopp|summa|total)[:\s]*(\d+)"
        match = re.search(int_pattern, text, re.IGNORECASE)
        if match:
            try:
                amount = float(match.group(1))
                if amount > 0:
                    return NormalizationResult.success(f"{amount:.2f}")
            except ValueError:
                pass

        # Very last resort: find any standalone number >= 3 digits
        standalone_pattern = r"\b(\d{3,})\b"
        matches = re.findall(standalone_pattern, text)
        if matches:
            # Take the last number (not the largest)
            try:
                amount = float(matches[-1])
                if amount > 0:
                    return NormalizationResult.success(f"{amount:.2f}")
            except ValueError:
                pass

        return NormalizationResult.failure(f"Cannot parse amount: {text}")


class EnhancedAmountNormalizer(AmountNormalizer):
    """
    Enhanced amount parsing with multiple strategies.

    Strategies:
    1. Pattern matching for Swedish formats
    2. Context-aware extraction (look for keywords like "Total", "Summa")
    3. OCR error correction for common digit errors
    4. Multi-amount handling (prefer the highest-priority, latest match)

    Amounts are accepted only when 0 < amount < 10 000 000.
    """

    def normalize(self, text: str) -> NormalizationResult:
        """Extract a monetary amount from ``text``.

        Order of evaluation:
        - Payment line kronor/ore format ("590 00") on the raw text.
        - Labeled / generic decimal amounts in the OCR-corrected text,
          ranked by pattern priority and then by position (later wins).
        - Shared TextCleaner / FieldValidators helpers.
        - Any grouped decimal number; the largest one is returned.

        Returns a success result with the amount formatted to two decimals,
        or a failure result for empty or unparseable text.
        """
        text = text.strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # Early check: payment line kronor/ore format ("590 00" -> 590.00)
        kronor_ore_result = self._try_kronor_ore(text)
        if kronor_ore_result is not None:
            return kronor_ore_result

        # Strategy 1: Apply OCR corrections first
        # NOTE(review): assumes correct_digits returns an object whose
        # `.corrected` attribute is the corrected string - confirm in
        # shared.utils.ocr_corrections.
        corrected_text = OCRCorrections.correct_digits(text, aggressive=False).corrected

        # Strategy 2: Look for labeled amounts (highest priority)
        # Each pattern has ONE capture group holding the amount; inside it the
        # first alternative is the European form and the second the Anglo form.
        labeled_patterns = [
            # Swedish patterns ((?!\d) prevents partial matches like "1,23" in "1,234.56")
            (r"(?:att\s+betala|summa|total|belopp)\s*[:\s]*(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))", 1.0),
            (
                r"(?:moms|vat)\s*[:\s]*(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))",
                0.8,
            ),  # Lower priority for VAT
            # Generic pattern
            (r"(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))\s*(?:kr|sek|kronor)?", 0.7),
        ]

        # (amount, priority, start offset of the match)
        candidates: list[tuple[float, float, int]] = []
        for pattern, priority in labeled_patterns:
            for match in re.finditer(pattern, corrected_text, re.IGNORECASE):
                amount = self._parse_amount_str(match.group(1))
                if amount is not None and 0 < amount < 10_000_000:
                    candidates.append((amount, priority, match.start()))

        if candidates:
            # Highest priority wins; among equals the later position wins
            # (later is usually the total).
            best_amount = max(candidates, key=lambda c: (c[1], c[2]))[0]
            return NormalizationResult.success(f"{best_amount:.2f}")

        # Strategy 3: Parse with shared validator
        cleaned = TextCleaner.normalize_amount_text(corrected_text)
        amount = FieldValidators.parse_amount(cleaned)
        if amount is not None and 0 < amount < 10_000_000:
            return NormalizationResult.success(f"{amount:.2f}")

        # Strategy 4: Try to extract any decimal number as fallback
        decimal_pattern = r"(\d{1,3}(?:[\s\.]?\d{3})*[,\.]\d{2})"
        amounts: list[float] = []
        for m in re.findall(decimal_pattern, corrected_text):
            # The pattern always ends in "<sep>dd", so the final separator is
            # the decimal mark whether it is a comma or a dot. Only the part
            # before it may contain grouping characters (space or dot).
            # Stripping every dot from the whole match would turn "1234.56"
            # into 123456.
            whole_part = m[:-3].replace(" ", "").replace(".", "")
            fraction = m[-2:]
            try:
                amt = float(f"{whole_part}.{fraction}")
            except ValueError:
                # e.g. grouping by tab/newline - not a trustworthy amount
                continue
            if 0 < amt < 10_000_000:
                amounts.append(amt)

        if amounts:
            # Return the largest amount (usually the total)
            return NormalizationResult.success(f"{max(amounts):.2f}")

        return NormalizationResult.failure(f"Cannot parse amount: {text[:50]}")