# invoice-master-poc-v2/packages/backend/backend/validation/vat_validator.py

"""
VAT Validator

Cross-validates VAT information from multiple sources:
- Mathematical verification (base × rate = vat)
- Line items vs VAT summary comparison
- Consistency with existing amount field
"""

from dataclasses import dataclass, field
from decimal import Decimal, InvalidOperation

from backend.vat.vat_extractor import VATSummary, AmountParser
from backend.table.line_items_extractor import LineItemsResult


@dataclass
class MathCheckResult:
    """Result of a single VAT rate mathematical check.

    One instance is produced per VAT breakdown whose VAT amount could be
    parsed; it records the figures that were compared and the verdict.
    """

    # VAT rate of the breakdown, as a percentage (the validator divides it by 100).
    rate: float
    # Parsed taxable base; None when the breakdown has no (parseable) base amount.
    base_amount: float | None
    # base_amount * rate / 100; None when base_amount is unavailable.
    expected_vat: float | None
    # VAT amount parsed from the breakdown itself.
    actual_vat: float
    # True when expected and actual VAT agree within `tolerance`; also True
    # when there was no base amount to verify against.
    is_valid: bool
    # Absolute tolerance that was in effect for this comparison.
    tolerance: float


@dataclass
class VATValidationResult:
    """Complete VAT validation result.

    Aggregates the outcome of every cross-check run by the validator together
    with an overall verdict, a confidence score and manual-review flags.
    """

    # Overall verdict: every math check passed, the totals check passed and the
    # existing amount field did not contradict the VAT summary.
    is_valid: bool
    # Overall confidence in the range 0.0 - 1.0, rounded to two decimals.
    confidence_score: float

    # Mathematical verification
    # Per-rate results of the "base x rate = vat" check.
    math_checks: list[MathCheckResult]
    # Does total incl. VAT equal total excl. VAT + total VAT?
    # Also True when the figures needed for the comparison are unavailable.
    total_check: bool

    # Source comparison
    # Does the line items total equal the VAT summary total?
    # None when no comparison was possible.
    line_items_vs_summary: bool | None
    # Does total_incl_vat equal the existing amount field?
    # None when no comparison was possible.
    amount_consistency: bool | None

    # Review flags
    # True when at least one review reason was recorded or the confidence
    # score is below the review threshold (0.7).
    needs_review: bool
    # Human-readable explanations of why the result should be reviewed.
    review_reasons: list[str] = field(default_factory=list)


class VATValidator:
    """Validates VAT information using multiple cross-checks.

    The checks are: per-rate arithmetic (base x rate = vat), totals
    consistency (excl + vat = incl), line items total vs. VAT summary, and
    agreement with a separately extracted amount field. Any check whose
    inputs are missing or unparseable is treated as "cannot verify" rather
    than as a failure.
    """

    # Slack added to every tolerance comparison. Amounts are binary floats, so
    # a difference of exactly `tolerance` can come out marginally larger
    # (1.02 - 1.0 == 0.020000000000000018) and would otherwise be rejected.
    _FLOAT_EPSILON = 1e-6

    # Results scoring below this confidence are flagged for manual review.
    REVIEW_CONFIDENCE_THRESHOLD = 0.7

    def __init__(self, tolerance: float = 0.02, line_items_tolerance: float = 1.0):
        """
        Initialize validator.

        Args:
            tolerance: Acceptable difference for math checks (default 0.02 = 2 cents)
            line_items_tolerance: Acceptable difference between the summed line
                items and the VAT summary total (default 1.0; larger than
                `tolerance` to absorb per-line rounding).
        """
        self.tolerance = tolerance
        self.line_items_tolerance = line_items_tolerance
        self.amount_parser = AmountParser()

    def validate(
        self,
        vat_summary: VATSummary,
        line_items: LineItemsResult | None = None,
        existing_amount: str | None = None,
    ) -> VATValidationResult:
        """
        Validate VAT information.

        Args:
            vat_summary: Extracted VAT summary.
            line_items: Optional line items for comparison.
            existing_amount: Optional existing amount field from YOLO extraction.

        Returns:
            VATValidationResult with all check results.
        """
        # Handle empty summary: nothing to validate, always needs review.
        if not vat_summary.breakdowns and not vat_summary.total_vat:
            return VATValidationResult(
                is_valid=False,
                confidence_score=0.0,
                math_checks=[],
                total_check=False,
                line_items_vs_summary=None,
                amount_consistency=None,
                needs_review=True,
                review_reasons=["No VAT information found"],
            )

        # Run all checks
        math_checks = self._run_math_checks(vat_summary)
        total_check = self._check_totals(vat_summary)
        line_items_check = self._check_line_items(vat_summary, line_items)
        amount_check = self._check_amount_consistency(vat_summary, existing_amount)

        # Collect review reasons. The optional checks are tri-state, so only an
        # explicit False (not None = "could not compare") counts as a failure.
        review_reasons: list[str] = []

        math_failures = [c for c in math_checks if not c.is_valid]
        if math_failures:
            review_reasons.append(f"Math check failed for {len(math_failures)} VAT rate(s)")

        if not total_check:
            review_reasons.append("Total amount mismatch (excl + vat != incl)")

        if line_items_check is False:
            review_reasons.append("Line items total doesn't match VAT summary")

        if amount_check is False:
            review_reasons.append("VAT total doesn't match existing amount field")

        # Calculate overall validity and confidence. A line-item mismatch only
        # triggers review; it does not invalidate the result.
        is_valid = not math_failures and total_check and (amount_check is not False)

        confidence_score = self._calculate_confidence(
            vat_summary, math_checks, total_check, line_items_check, amount_check
        )

        needs_review = (
            bool(review_reasons)
            or confidence_score < self.REVIEW_CONFIDENCE_THRESHOLD
        )

        return VATValidationResult(
            is_valid=is_valid,
            confidence_score=confidence_score,
            math_checks=math_checks,
            total_check=total_check,
            line_items_vs_summary=line_items_check,
            amount_consistency=amount_check,
            needs_review=needs_review,
            review_reasons=review_reasons,
        )

    def _within_tolerance(
        self, left: float, right: float, tolerance: float | None = None
    ) -> bool:
        """Return True when two amounts differ by at most the tolerance.

        Args:
            left: First amount.
            right: Second amount.
            tolerance: Allowed absolute difference; defaults to `self.tolerance`.

        Returns:
            True when ``abs(left - right)`` does not exceed the tolerance,
            allowing `_FLOAT_EPSILON` of slack for float representation error.
        """
        limit = self.tolerance if tolerance is None else tolerance
        return abs(left - right) <= limit + self._FLOAT_EPSILON

    def _run_math_checks(self, vat_summary: VATSummary) -> list[MathCheckResult]:
        """Run mathematical verification for each VAT rate.

        Breakdowns whose VAT amount cannot be parsed are skipped. A breakdown
        without a usable base amount is recorded as valid, since there is
        nothing to verify the VAT amount against.
        """
        results = []

        for breakdown in vat_summary.breakdowns:
            actual_vat = self.amount_parser.parse(breakdown.vat_amount)
            if actual_vat is None:
                continue

            base_amount = None
            expected_vat = None
            is_valid = True  # Stays True unless a real comparison fails

            if breakdown.base_amount:
                base_amount = self.amount_parser.parse(breakdown.base_amount)
                if base_amount is not None:
                    # Rate is a percentage, e.g. 25 for 25 %.
                    expected_vat = base_amount * (breakdown.rate / 100)
                    is_valid = self._within_tolerance(expected_vat, actual_vat)

            results.append(
                MathCheckResult(
                    rate=breakdown.rate,
                    base_amount=base_amount,
                    expected_vat=expected_vat,
                    actual_vat=actual_vat,
                    is_valid=is_valid,
                    tolerance=self.tolerance,
                )
            )

        return results

    def _check_totals(self, vat_summary: VATSummary) -> bool:
        """Check if total_excl + total_vat = total_incl.

        Returns:
            False only when all required figures are available and disagree by
            more than the tolerance. True when they agree, and also whenever
            the comparison cannot be made (missing or unparseable values).
        """
        if not vat_summary.total_excl_vat or not vat_summary.total_incl_vat:
            # Can't verify without both values
            return True  # Assume ok if we can't check

        excl = self.amount_parser.parse(vat_summary.total_excl_vat)
        incl = self.amount_parser.parse(vat_summary.total_incl_vat)

        if excl is None or incl is None:
            return True  # Can't verify

        if vat_summary.total_vat:
            vat = self.amount_parser.parse(vat_summary.total_vat)
            if vat is None:
                # Can't verify if vat parsing failed
                return True
        else:
            # No explicit VAT total: sum up the breakdown VAT amounts instead.
            breakdown_vats = [
                self.amount_parser.parse(b.vat_amount) for b in vat_summary.breakdowns
            ]
            # An empty or partially unparseable list gives no trustworthy sum;
            # counting the gaps as 0 would report a mismatch that may not exist.
            if not breakdown_vats or any(v is None for v in breakdown_vats):
                return True
            vat = sum(breakdown_vats)

        return self._within_tolerance(excl + vat, incl)

    def _check_line_items(
        self, vat_summary: VATSummary, line_items: LineItemsResult | None
    ) -> bool | None:
        """Check if line items total matches VAT summary.

        Returns:
            True/False when the summed line item amounts could be compared
            with the summary's total excl. VAT, None when no comparison is
            possible (no line items, no parseable line amount, or no
            parseable summary total).
        """
        if line_items is None or not line_items.items:
            return None  # No comparison possible

        # Parse every line item amount that is present and readable.
        amounts = []
        for item in line_items.items:
            if item.amount:
                amount = self.amount_parser.parse(item.amount)
                if amount is not None:
                    amounts.append(amount)

        if not amounts:
            # Nothing parsed: a total of 0 would be a guess, not a measurement.
            return None

        # Compare with VAT summary total
        if vat_summary.total_excl_vat:
            summary_total = self.amount_parser.parse(vat_summary.total_excl_vat)
            if summary_total is not None:
                # Allow larger tolerance for line items (rounding errors)
                return self._within_tolerance(
                    sum(amounts), summary_total, self.line_items_tolerance
                )

        return None

    def _check_amount_consistency(
        self, vat_summary: VATSummary, existing_amount: str | None
    ) -> bool | None:
        """Check if VAT total matches existing amount field.

        Returns:
            True/False when both the existing amount and the summary's total
            incl. VAT could be parsed and compared, None otherwise.
        """
        if existing_amount is None:
            return None  # No comparison possible

        existing = self.amount_parser.parse(existing_amount)
        if existing is None:
            return None

        if vat_summary.total_incl_vat:
            vat_total = self.amount_parser.parse(vat_summary.total_incl_vat)
            if vat_total is not None:
                return self._within_tolerance(existing, vat_total)

        return None

    def _calculate_confidence(
        self,
        vat_summary: VATSummary,
        math_checks: list[MathCheckResult],
        total_check: bool,
        line_items_check: bool | None,
        amount_check: bool | None,
    ) -> float:
        """Calculate overall confidence score.

        Starts from the extraction confidence of the summary and scales it by
        the outcome of each check: failures reduce the score, confirmed
        optional checks boost it (capped at 1.0), and checks that could not
        be performed (None) leave it unchanged.

        Returns:
            The adjusted score rounded to two decimals.
        """
        score = vat_summary.confidence  # Start with extraction confidence

        # Math checks scale the score between 50 % (all failed) and 100 % (all passed).
        if math_checks:
            math_valid_ratio = sum(1 for c in math_checks if c.is_valid) / len(math_checks)
            score = score * (0.5 + 0.5 * math_valid_ratio)

        if not total_check:
            score *= 0.5

        if line_items_check is True:
            score = min(score * 1.1, 1.0)  # Boost if line items match
        elif line_items_check is False:
            score *= 0.7

        if amount_check is True:
            score = min(score * 1.1, 1.0)  # Boost if amount matches
        elif amount_check is False:
            score *= 0.6

        return round(score, 2)