# invoice-master-poc-v2/packages/backend/backend/vat/vat_extractor.py

"""
VAT Extractor

Extracts VAT (Moms) information from Swedish invoice text using regex patterns.
Supports multiple VAT rates (25%, 12%, 6%, 0%) and various Swedish formats.
"""

from dataclasses import dataclass
import re
from decimal import Decimal, InvalidOperation


@dataclass
class VATBreakdown:
    """VAT figures found for one VAT rate on an invoice.

    Amounts are kept as strings rather than numbers; the type annotations
    below make ``base_amount`` optional and ``vat_amount`` required.
    """

    # VAT rate in percent.
    rate: float  # 25.0, 12.0, 6.0, 0.0
    # Amount the VAT was calculated on; None when no tax base was found.
    base_amount: str | None  # Tax base (excl VAT)
    # VAT charged at this rate.
    vat_amount: str  # VAT amount
    # Where this breakdown came from.
    source: str  # 'regex' | 'line_items'


@dataclass
class VATSummary:
    """VAT information extracted from one invoice.

    The three totals are optional strings; each is None when the
    corresponding figure was not found.
    """

    # One entry per VAT rate found; empty when none were found.
    breakdowns: list[VATBreakdown]
    # Invoice total excluding VAT.
    total_excl_vat: str | None
    # Total VAT amount.
    total_vat: str | None
    # Invoice total including VAT.
    total_incl_vat: str | None
    # Score describing how complete/consistent the extraction is
    # (presumably in the range 0.0-1.0 -- confirm against the producer).
    confidence: float


class AmountParser:
    """Parse Swedish and European number formats."""

    # Patterns to clean amount strings
    CURRENCY_PATTERN = re.compile(r"(SEK|kr|:-)\s*", re.IGNORECASE)

    def parse(self, amount_str: str) -> float | None:
        """
        Parse amount string to float.

        Handles:
        - Swedish: 1 234,56
        - European: 1.234,56
        - US: 1,234.56

        Args:
            amount_str: Amount string to parse.

        Returns:
            Parsed float value or None if invalid.
        """
        if not amount_str or not amount_str.strip():
            return None

        # Clean the string
        cleaned = amount_str.strip()

        # Remove currency
        cleaned = self.CURRENCY_PATTERN.sub("", cleaned).strip()
        cleaned = re.sub(r"^SEK\s*", "", cleaned, flags=re.IGNORECASE)

        if not cleaned:
            return None

        # Check for negative
        is_negative = cleaned.startswith("-")
        if is_negative:
            cleaned = cleaned[1:].strip()

        try:
            # Remove spaces (Swedish thousands separator)
            cleaned = cleaned.replace(" ", "")

            # Detect format
            # Swedish/European: comma is decimal separator
            # US: period is decimal separator
            has_comma = "," in cleaned
            has_period = "." in cleaned

            if has_comma and has_period:
                # Both present - check position
                comma_pos = cleaned.rfind(",")
                period_pos = cleaned.rfind(".")

                if comma_pos > period_pos:
                    # European: 1.234,56
                    cleaned = cleaned.replace(".", "")
                    cleaned = cleaned.replace(",", ".")
                else:
                    # US: 1,234.56
                    cleaned = cleaned.replace(",", "")
            elif has_comma:
                # Swedish: 1234,56
                cleaned = cleaned.replace(",", ".")
            # else: US format or integer

            value = float(cleaned)
            return -value if is_negative else value

        except (ValueError, InvalidOperation):
            return None


class VATExtractor:
    """Extract VAT information from invoice text.

    Extraction is purely regex based. Amounts are returned as cleaned strings
    in the notation used on the invoice (e.g. ``"2 500,00"``); they are parsed
    to numbers only for validation and for the confidence score.
    """

    # Capturing group for an amount: digits, commas, periods and horizontal
    # whitespace. Line breaks are excluded so an amount can never run into a
    # number that starts the following line.
    _AMOUNT = r"((?:[\d,.]|[^\S\r\n])+)"

    # Guard placed before an amount: fails when the upcoming run of amount
    # characters ends in "%", i.e. when the number is a VAT rate.
    _NOT_A_RATE = r"(?!(?:[\d,.]|[^\S\r\n])*%)"

    # VAT extraction patterns (group 1: rate, group 2: VAT amount).
    # The amount class holds only digits, spaces, commas and periods, so a
    # captured amount never crosses a line boundary.
    VAT_PATTERNS = [
        # Moms 25%: 2 500,00 or Moms 25% 2 500,00
        re.compile(
            r"[Mm]oms\s*(\d+(?:[,\.]\d+)?)\s*%\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # Varav moms 25% 2 500,00
        re.compile(
            r"[Vv]arav\s+moms\s+(\d+(?:[,\.]\d+)?)\s*%\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # 25% moms: 2 500,00 (at line start or after whitespace)
        re.compile(
            r"(?:^|\s)(\d+(?:[,\.]\d+)?)\s*%\s*moms\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # Moms (25%): 2 500,00
        re.compile(
            r"[Mm]oms\s*\((\d+(?:[,\.]\d+)?)\s*%\)\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
    ]

    # Pattern with base amount (group 1: rate, group 2: VAT amount,
    # group 3: optional "Underlag" base amount).
    # The search for "Underlag" may continue onto later lines (DOTALL) but
    # stops at the next "moms", so one rate cannot take the tax base that is
    # printed next to a following rate.
    VAT_WITH_BASE_PATTERN = re.compile(
        r"[Mm]oms\s*(\d+(?:[,\.]\d+)?)\s*%\s*:?\s*"
        + _AMOUNT
        + r"(?:(?:(?![Mm]oms).)*?[Uu]nderlag\s*:?\s*"
        + _NOT_A_RATE
        + _AMOUNT
        + r")?",
        re.MULTILINE | re.DOTALL,
    )

    # Total patterns
    # NOTE: none of the methods in this class reference these three
    # attributes; they are left untouched for any external users.
    TOTAL_EXCL_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?|[Nn]etto)\s*(?:exkl\.?\s*)?(?:moms)?\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )
    TOTAL_VAT_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?)\s*moms\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )
    TOTAL_INCL_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?|[Bb]rutto)\s*(?:inkl\.?\s*)?(?:moms|att\s*betala)?\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )

    # Patterns used by the total extractors, most specific label first.
    # Compiled once here instead of on every call.
    _TOTAL_EXCL_PATTERNS = (
        re.compile(r"[Ss]umma\s+exkl\.?\s*moms\s*:?\s*" + _AMOUNT),
        re.compile(r"[Nn]etto\s*:?\s*" + _AMOUNT),
        re.compile(r"[Ee]xkl\.?\s*moms\s*:?\s*" + _AMOUNT),
    )
    _TOTAL_VAT_PATTERNS = (
        re.compile(r"[Ss]umma\s+moms\s*:?\s*" + _NOT_A_RATE + _AMOUNT),
        re.compile(r"[Tt]otal(?:t)?\s+moms\s*:?\s*" + _NOT_A_RATE + _AMOUNT),
        # Generic "Moms:" without percentage. The guard keeps the rate of a
        # "Moms 25%: ..." line from being reported as the total VAT.
        re.compile(r"^[Mm]oms\s*:?\s*" + _NOT_A_RATE + _AMOUNT, re.MULTILINE),
    )
    _TOTAL_INCL_PATTERNS = (
        re.compile(r"[Ss]umma\s+inkl\.?\s*moms\s*:?\s*" + _AMOUNT),
        re.compile(r"[Tt]otal(?:t)?\s+att\s+betala\s*:?\s*" + _AMOUNT),
        re.compile(r"[Bb]rutto\s*:?\s*" + _AMOUNT),
        re.compile(r"[Aa]tt\s+betala\s*:?\s*" + _AMOUNT),
    )

    # Trailing characters that are not digits, whitespace, comma or period.
    _TRAILING_JUNK = re.compile(r"[^\d\s,\.]+$")

    def __init__(self) -> None:
        """Create an extractor with its own AmountParser for validation."""
        self.amount_parser = AmountParser()

    def extract(self, text: str) -> VATSummary:
        """
        Extract VAT information from text.

        Args:
            text: Invoice text (OCR output).

        Returns:
            VATSummary with extracted information. For empty or
            whitespace-only text the summary has no breakdowns, no totals and
            a confidence of 0.0.
        """
        if not text or not text.strip():
            return VATSummary(
                breakdowns=[],
                total_excl_vat=None,
                total_vat=None,
                total_incl_vat=None,
                confidence=0.0,
            )

        breakdowns = self._extract_breakdowns(text)
        total_excl = self._extract_total_excl(text)
        total_vat = self._extract_total_vat(text)
        total_incl = self._extract_total_incl(text)

        confidence = self._calculate_confidence(
            breakdowns, total_excl, total_vat, total_incl
        )

        return VATSummary(
            breakdowns=breakdowns,
            total_excl_vat=total_excl,
            total_vat=total_vat,
            total_incl_vat=total_incl,
            confidence=confidence,
        )

    def _extract_breakdowns(self, text: str) -> list[VATBreakdown]:
        """Extract individual VAT rate breakdowns.

        Only the first valid hit per rate is kept. The pattern that can also
        capture the tax base runs first so that, when both it and a plain
        pattern match a rate, the entry carrying the base amount wins.
        """
        # (rate, VAT amount, base amount) as raw strings, in priority order.
        candidates = [
            (match.group(1), match.group(2), match.group(3))
            for match in self.VAT_WITH_BASE_PATTERN.finditer(text)
        ]
        for pattern in self.VAT_PATTERNS:
            candidates.extend(
                (match.group(1), match.group(2), None)
                for match in pattern.finditer(text)
            )

        breakdowns: list[VATBreakdown] = []
        seen_rates: set[float] = set()
        for rate_str, vat_str, base_str in candidates:
            rate = self._parse_rate(rate_str)
            vat_amount = self._clean_amount(vat_str)
            if rate is None or vat_amount is None or rate in seen_rates:
                continue

            seen_rates.add(rate)
            breakdowns.append(
                VATBreakdown(
                    rate=rate,
                    base_amount=self._clean_amount(base_str) if base_str else None,
                    vat_amount=vat_amount,
                    source="regex",
                )
            )

        return breakdowns

    def _first_amount(
        self, patterns: tuple[re.Pattern[str], ...], text: str
    ) -> str | None:
        """Return the first valid amount captured by any of the patterns.

        Patterns are tried in order. A match whose captured text does not
        validate as an amount is skipped rather than ending the search.
        """
        for pattern in patterns:
            for match in pattern.finditer(text):
                amount = self._clean_amount(match.group(1))
                if amount is not None:
                    return amount
        return None

    def _extract_total_excl(self, text: str) -> str | None:
        """Extract total excluding VAT, or None if not found."""
        return self._first_amount(self._TOTAL_EXCL_PATTERNS, text)

    def _extract_total_vat(self, text: str) -> str | None:
        """Extract total VAT amount, or None if not found."""
        return self._first_amount(self._TOTAL_VAT_PATTERNS, text)

    def _extract_total_incl(self, text: str) -> str | None:
        """Extract total including VAT, or None if not found."""
        return self._first_amount(self._TOTAL_INCL_PATTERNS, text)

    def _parse_rate(self, rate_str: str) -> float | None:
        """Parse VAT rate string (comma or period decimals) to float.

        Returns None when the string is not a number.
        """
        try:
            return float(rate_str.replace(",", "."))
        except (ValueError, TypeError):
            return None

    def _clean_amount(self, amount_str: str) -> str | None:
        """Clean and validate amount string.

        Returns the stripped amount text, or None when nothing is left or the
        text does not parse as a number.
        """
        if not amount_str:
            return None

        # Remove trailing non-numeric chars (except comma/period)
        cleaned = self._TRAILING_JUNK.sub("", amount_str.strip()).strip()

        if not cleaned:
            return None

        # Validate it parses as a number
        if self.amount_parser.parse(cleaned) is None:
            return None

        return cleaned

    def _calculate_confidence(
        self,
        breakdowns: list[VATBreakdown],
        total_excl: str | None,
        total_vat: str | None,
        total_incl: str | None,
    ) -> float:
        """Calculate confidence score based on extracted data.

        Weights: breakdowns 0.3, total excl. VAT 0.2, total VAT 0.2,
        total incl. VAT 0.15, and 0.15 more when excl + VAT equals incl
        within 0.02. The result is capped at 1.0.
        """
        score = 0.0

        # Has VAT breakdowns
        if breakdowns:
            score += 0.3

        # Has total excluding VAT
        if total_excl:
            score += 0.2

        # Has total VAT
        if total_vat:
            score += 0.2

        # Has total including VAT
        if total_incl:
            score += 0.15

        # Mathematical consistency check
        if total_excl and total_vat and total_incl:
            excl = self.amount_parser.parse(total_excl)
            vat = self.amount_parser.parse(total_vat)
            incl = self.amount_parser.parse(total_incl)

            # Compare against None explicitly: 0.0 is a legitimate amount
            # (e.g. VAT on a 0% invoice) and must not skip the check.
            if excl is not None and vat is not None and incl is not None:
                if abs(excl + vat - incl) < 0.02:  # Allow 2 cent tolerance
                    score += 0.15

        return min(score, 1.0)