Files
invoice-master-poc-v2/packages/backend/backend/validation/vat_validator.py
2026-02-03 21:28:06 +01:00

268 lines
9.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
VAT Validator

Cross-validates VAT information from multiple sources:
- Mathematical verification (base × rate = VAT amount, per VAT rate)
- Line items total vs. VAT summary total (excl. VAT) comparison
- Consistency of the VAT-inclusive total with the existing amount field
"""
from dataclasses import dataclass, field
from decimal import Decimal, InvalidOperation
from backend.vat.vat_extractor import VATSummary, AmountParser
from backend.table.line_items_extractor import LineItemsResult
@dataclass
class MathCheckResult:
"""Result of a single VAT rate mathematical check."""
rate: float
base_amount: float | None
expected_vat: float | None
actual_vat: float
is_valid: bool
tolerance: float
@dataclass
class VATValidationResult:
    """Aggregated outcome of all VAT cross-checks for one document.

    Attributes:
        is_valid: Overall verdict of the validation.
        confidence_score: Confidence in the extracted VAT data; intended
            range is 0.0 - 1.0.
        math_checks: One arithmetic check result per verified VAT rate.
        total_check: Whether ``incl = excl + total_vat`` holds.
        line_items_vs_summary: Whether the line items total equals the VAT
            summary total; None when no comparison was possible.
        amount_consistency: Whether ``total_incl_vat`` equals the existing
            amount field; None when no comparison was possible.
        needs_review: True when a human should look at the document.
        review_reasons: Human-readable explanations for ``needs_review``.
    """

    is_valid: bool
    confidence_score: float

    # Mathematical verification
    math_checks: list[MathCheckResult]
    total_check: bool

    # Source comparison
    line_items_vs_summary: bool | None
    amount_consistency: bool | None

    # Review flags
    needs_review: bool
    review_reasons: list[str] = field(default_factory=list)
class VATValidator:
    """Validates VAT information using multiple cross-checks.

    The validator combines four independent checks on an extracted VAT
    summary: per-rate arithmetic, the excl + VAT = incl identity, agreement
    with the line items total, and agreement with a separately extracted
    amount field. The outcomes are folded into a validity flag, a confidence
    score and a list of review reasons.
    """

    # Slack added to every tolerance comparison. Amounts are binary floats,
    # so a difference that is exactly at the tolerance can come out a hair
    # above it (1.02 - 1.00 == 0.020000000000000018) and would otherwise be
    # rejected even though it is within the documented limit.
    _FLOAT_EPSILON = 1e-9

    # Class-level fallback so instances created without running __init__
    # (e.g. subclasses overriding it) still have a line-item tolerance.
    line_items_tolerance: float = 1.0

    def __init__(self, tolerance: float = 0.02, line_items_tolerance: float = 1.0):
        """
        Initialize validator.

        Args:
            tolerance: Acceptable difference for math checks (default 0.02 = 2 cents)
            line_items_tolerance: Acceptable difference between the summed
                line items and the VAT summary total (default 1.0). It is
                larger than ``tolerance`` to absorb per-line rounding.
        """
        self.tolerance = tolerance
        self.line_items_tolerance = line_items_tolerance
        self.amount_parser = AmountParser()

    def _within_tolerance(self, left: float, right: float, tolerance: float) -> bool:
        """Return True when ``left`` and ``right`` differ by at most ``tolerance``.

        The comparison is inclusive and padded with ``_FLOAT_EPSILON`` so that
        floating-point representation noise cannot fail an exact-boundary
        difference.
        """
        return abs(left - right) <= tolerance + self._FLOAT_EPSILON

    def validate(
        self,
        vat_summary: VATSummary,
        line_items: LineItemsResult | None = None,
        existing_amount: str | None = None,
    ) -> VATValidationResult:
        """
        Validate VAT information.

        Args:
            vat_summary: Extracted VAT summary.
            line_items: Optional line items for comparison.
            existing_amount: Optional existing amount field from YOLO extraction.

        Returns:
            VATValidationResult with all check results.
        """
        review_reasons: list[str] = []

        # Handle empty summary: nothing to cross-check, always needs review.
        if not vat_summary.breakdowns and not vat_summary.total_vat:
            return VATValidationResult(
                is_valid=False,
                confidence_score=0.0,
                math_checks=[],
                total_check=False,
                line_items_vs_summary=None,
                amount_consistency=None,
                needs_review=True,
                review_reasons=["No VAT information found"],
            )

        # Run all checks
        math_checks = self._run_math_checks(vat_summary)
        total_check = self._check_totals(vat_summary)
        line_items_check = self._check_line_items(vat_summary, line_items)
        amount_check = self._check_amount_consistency(vat_summary, existing_amount)

        # Collect review reasons
        math_failures = [c for c in math_checks if not c.is_valid]
        if math_failures:
            review_reasons.append(f"Math check failed for {len(math_failures)} VAT rate(s)")
        if not total_check:
            review_reasons.append("Total amount mismatch (excl + vat != incl)")
        if line_items_check is False:
            review_reasons.append("Line items total doesn't match VAT summary")
        if amount_check is False:
            review_reasons.append("VAT total doesn't match existing amount field")

        # Calculate overall validity and confidence. A line-item mismatch
        # does not invalidate the result; it only lowers confidence and
        # triggers a review. An inconclusive amount check (None) is accepted.
        all_math_valid = all(c.is_valid for c in math_checks) if math_checks else True
        is_valid = all_math_valid and total_check and (amount_check is not False)
        confidence_score = self._calculate_confidence(
            vat_summary, math_checks, total_check, line_items_check, amount_check
        )
        needs_review = len(review_reasons) > 0 or confidence_score < 0.7

        return VATValidationResult(
            is_valid=is_valid,
            confidence_score=confidence_score,
            math_checks=math_checks,
            total_check=total_check,
            line_items_vs_summary=line_items_check,
            amount_consistency=amount_check,
            needs_review=needs_review,
            review_reasons=review_reasons,
        )

    def _run_math_checks(self, vat_summary: VATSummary) -> list[MathCheckResult]:
        """Run mathematical verification for each VAT rate.

        Breakdowns whose VAT amount cannot be parsed are skipped. A breakdown
        without a usable base amount is recorded as valid with
        ``expected_vat=None`` because there is nothing to compare against.

        Returns:
            One MathCheckResult per breakdown with a parseable VAT amount.
        """
        results: list[MathCheckResult] = []
        for breakdown in vat_summary.breakdowns:
            actual_vat = self.amount_parser.parse(breakdown.vat_amount)
            if actual_vat is None:
                continue

            base_amount = None
            expected_vat = None
            is_valid = True
            if breakdown.base_amount:
                base_amount = self.amount_parser.parse(breakdown.base_amount)
                if base_amount is not None:
                    # rate is a percentage, hence the division by 100.
                    expected_vat = base_amount * (breakdown.rate / 100)
                    is_valid = self._within_tolerance(
                        expected_vat, actual_vat, self.tolerance
                    )

            results.append(
                MathCheckResult(
                    rate=breakdown.rate,
                    base_amount=base_amount,
                    expected_vat=expected_vat,
                    actual_vat=actual_vat,
                    is_valid=is_valid,
                    tolerance=self.tolerance,
                )
            )
        return results

    def _check_totals(self, vat_summary: VATSummary) -> bool:
        """Check if total_excl + total_vat = total_incl.

        Uses ``total_vat`` when present, otherwise the sum of the parseable
        breakdown VAT amounts.

        Returns:
            False only on a verified mismatch. True when the totals agree
            within ``self.tolerance`` or when the identity cannot be checked
            (a required value is missing or unparseable).
        """
        if not vat_summary.total_excl_vat or not vat_summary.total_incl_vat:
            # Can't verify without both values
            return True  # Assume ok if we can't check

        excl = self.amount_parser.parse(vat_summary.total_excl_vat)
        incl = self.amount_parser.parse(vat_summary.total_incl_vat)
        if excl is None or incl is None:
            return True  # Can't verify

        if vat_summary.total_vat:
            vat = self.amount_parser.parse(vat_summary.total_vat)
            if vat is None:
                # Can't verify if vat parsing failed
                return True
        else:
            # Sum up breakdown VAT amounts, ignoring unparseable ones.
            parsed_vats = [
                amount
                for amount in (
                    self.amount_parser.parse(b.vat_amount)
                    for b in vat_summary.breakdowns
                )
                if amount is not None
            ]
            if not parsed_vats:
                # No VAT figure at all: treating it as 0 would report a
                # mismatch that was never actually verified.
                return True
            vat = sum(parsed_vats)

        return self._within_tolerance(excl + vat, incl, self.tolerance)

    def _check_line_items(
        self, vat_summary: VATSummary, line_items: LineItemsResult | None
    ) -> bool | None:
        """Check if line items total matches VAT summary.

        The sum of the parseable line item amounts is compared with
        ``total_excl_vat`` using ``self.line_items_tolerance``.

        Returns:
            True/False for a completed comparison; None when no comparison is
            possible (no line items, no parseable line amount, or no
            parseable summary total).
        """
        if line_items is None or not line_items.items:
            return None  # No comparison possible

        # Sum line item amounts
        line_total = 0.0
        any_parsed = False
        for item in line_items.items:
            if item.amount:
                amount = self.amount_parser.parse(item.amount)
                if amount is not None:
                    line_total += amount
                    any_parsed = True
        if not any_parsed:
            # A total of 0.0 built from nothing is not evidence of a mismatch.
            return None

        # Compare with VAT summary total
        if vat_summary.total_excl_vat:
            summary_total = self.amount_parser.parse(vat_summary.total_excl_vat)
            if summary_total is not None:
                # Allow larger tolerance for line items (rounding errors)
                return self._within_tolerance(
                    line_total, summary_total, self.line_items_tolerance
                )
        return None

    def _check_amount_consistency(
        self, vat_summary: VATSummary, existing_amount: str | None
    ) -> bool | None:
        """Check if VAT total matches existing amount field.

        Compares ``total_incl_vat`` with ``existing_amount`` using
        ``self.tolerance``.

        Returns:
            True/False for a completed comparison; None when either value is
            missing or cannot be parsed.
        """
        if existing_amount is None:
            return None  # No comparison possible

        existing = self.amount_parser.parse(existing_amount)
        if existing is None:
            return None

        if vat_summary.total_incl_vat:
            vat_total = self.amount_parser.parse(vat_summary.total_incl_vat)
            if vat_total is not None:
                return self._within_tolerance(existing, vat_total, self.tolerance)
        return None

    def _calculate_confidence(
        self,
        vat_summary: VATSummary,
        math_checks: list[MathCheckResult],
        total_check: bool,
        line_items_check: bool | None,
        amount_check: bool | None,
    ) -> float:
        """Calculate overall confidence score.

        Starts from the extraction confidence and scales it by each check:
        failed math checks scale it down to as low as 50%, a totals mismatch
        halves it, and the line-item / amount comparisons boost it by 10%
        (capped at 1.0) on a match or cut it (x0.7 / x0.6) on a mismatch.
        Inconclusive (None) comparisons leave the score unchanged.

        Returns:
            The score rounded to two decimals.
        """
        score = vat_summary.confidence  # Start with extraction confidence

        # Adjust based on validation results
        if math_checks:
            math_valid_ratio = sum(1 for c in math_checks if c.is_valid) / len(math_checks)
            score = score * (0.5 + 0.5 * math_valid_ratio)

        if not total_check:
            score *= 0.5

        if line_items_check is True:
            score = min(score * 1.1, 1.0)  # Boost if line items match
        elif line_items_check is False:
            score *= 0.7

        if amount_check is True:
            score = min(score * 1.1, 1.0)  # Boost if amount matches
        elif amount_check is False:
            score *= 0.6

        return round(score, 2)