"""
VAT Validator

Cross-validates VAT information from multiple sources:
- Mathematical verification (base × rate = vat)
- Line items vs VAT summary comparison
- Consistency with existing amount field
"""

import math
from dataclasses import dataclass, field
from decimal import Decimal, InvalidOperation

from backend.vat.vat_extractor import VATSummary, AmountParser
from backend.table.line_items_extractor import LineItemsResult


@dataclass
class MathCheckResult:
    """Result of a single VAT rate mathematical check.

    One instance is produced per VAT breakdown whose VAT amount could be
    parsed; it records the inputs of the ``base × rate = vat`` comparison
    together with its outcome.
    """

    # VAT rate of the breakdown, expressed in percent (it is divided by 100
    # when the expected VAT is computed).
    rate: float
    # Parsed base (net) amount, or None when it was missing or unparseable.
    base_amount: float | None
    # base_amount * rate / 100, or None when no base amount was available.
    expected_vat: float | None
    # Parsed VAT amount taken from the breakdown.
    actual_vat: float
    # True when expected and actual VAT agree within ``tolerance``; also True
    # when no base amount was available and nothing could be compared.
    is_valid: bool
    # Absolute tolerance that was applied to the comparison.
    tolerance: float


@dataclass
class VATValidationResult:
    """Complete VAT validation result.

    Aggregates the outcome of every cross-check together with the overall
    verdict, a confidence score and the reasons (if any) for manual review.
    """

    # Overall verdict of the validation.
    is_valid: bool
    # Confidence in the extracted VAT data, 0.0 - 1.0.
    confidence_score: float

    # Mathematical verification
    # One entry per VAT breakdown that could be checked.
    math_checks: list[MathCheckResult]
    # incl = excl + total_vat?
    total_check: bool

    # Source comparison (None means the comparison was not possible)
    # line items total = VAT summary?
    line_items_vs_summary: bool | None
    # total_incl_vat = existing amount field?
    amount_consistency: bool | None

    # Review flags
    needs_review: bool
    # Human-readable explanations of why a review is needed.
    review_reasons: list[str] = field(default_factory=list)


class VATValidator:
    """Validates VAT information using multiple cross-checks.

    The checks are: per-rate arithmetic (base × rate = vat), totals
    arithmetic (excl + vat = incl), line items total vs. summary total, and
    summary total vs. a separately extracted amount field.
    """

    # Absolute slack used when a float difference is compared to a tolerance.
    # Binary floating point cannot represent most decimal amounts exactly
    # (0.07 - 0.05 evaluates to 0.020000000000000004), so a difference that is
    # exactly on the tolerance boundary must not be rejected by rounding noise.
    _FLOAT_SLACK = 1e-6

    def __init__(self, tolerance: float = 0.02, line_items_tolerance: float = 1.0):
        """
        Initialize validator.

        Args:
            tolerance: Acceptable difference for math checks (default 0.02 = 2 cents)
            line_items_tolerance: Acceptable difference between the sum of the
                line items and the summary total (default 1.0; larger than
                ``tolerance`` to allow for rounding across line items).
        """
        self.tolerance = tolerance
        self.line_items_tolerance = line_items_tolerance
        self.amount_parser = AmountParser()

    @staticmethod
    def _within_tolerance(left: float, right: float, tolerance: float) -> bool:
        """Return True when ``left`` and ``right`` differ by at most ``tolerance``.

        A difference that exceeds the tolerance only by floating-point noise
        (up to ``_FLOAT_SLACK``) is still accepted.
        """
        difference = abs(left - right)
        return difference <= tolerance or math.isclose(
            difference, tolerance, rel_tol=0.0, abs_tol=VATValidator._FLOAT_SLACK
        )

    def validate(
        self,
        vat_summary: VATSummary,
        line_items: LineItemsResult | None = None,
        existing_amount: str | None = None,
    ) -> VATValidationResult:
        """
        Validate VAT information.

        Args:
            vat_summary: Extracted VAT summary.
            line_items: Optional line items for comparison.
            existing_amount: Optional existing amount field from YOLO extraction.

        Returns:
            VATValidationResult with all check results.
        """
        # Handle empty summary: nothing to validate, always flag for review.
        if not vat_summary.breakdowns and not vat_summary.total_vat:
            return VATValidationResult(
                is_valid=False,
                confidence_score=0.0,
                math_checks=[],
                total_check=False,
                line_items_vs_summary=None,
                amount_consistency=None,
                needs_review=True,
                review_reasons=["No VAT information found"],
            )

        # Run all checks
        math_checks = self._run_math_checks(vat_summary)
        total_check = self._check_totals(vat_summary)
        line_items_check = self._check_line_items(vat_summary, line_items)
        amount_check = self._check_amount_consistency(vat_summary, existing_amount)

        # Collect review reasons. The tri-state checks are compared with
        # ``is False`` because None means "could not be checked", not "failed".
        review_reasons: list[str] = []
        math_failures = [check for check in math_checks if not check.is_valid]
        if math_failures:
            review_reasons.append(f"Math check failed for {len(math_failures)} VAT rate(s)")
        if not total_check:
            review_reasons.append("Total amount mismatch (excl + vat != incl)")
        if line_items_check is False:
            review_reasons.append("Line items total doesn't match VAT summary")
        if amount_check is False:
            review_reasons.append("VAT total doesn't match existing amount field")

        # Overall validity: a line-items mismatch triggers a review and lowers
        # the confidence, but does not by itself invalidate the result.
        is_valid = not math_failures and total_check and (amount_check is not False)

        confidence_score = self._calculate_confidence(
            vat_summary, math_checks, total_check, line_items_check, amount_check
        )

        needs_review = bool(review_reasons) or confidence_score < 0.7

        return VATValidationResult(
            is_valid=is_valid,
            confidence_score=confidence_score,
            math_checks=math_checks,
            total_check=total_check,
            line_items_vs_summary=line_items_check,
            amount_consistency=amount_check,
            needs_review=needs_review,
            review_reasons=review_reasons,
        )

    def _run_math_checks(self, vat_summary: VATSummary) -> list[MathCheckResult]:
        """Run mathematical verification for each VAT rate.

        Breakdowns whose VAT amount cannot be parsed are skipped. A breakdown
        without a usable base amount yields a result that is marked valid,
        because there is nothing to compare against.
        """
        results = []

        for breakdown in vat_summary.breakdowns:
            actual_vat = self.amount_parser.parse(breakdown.vat_amount)
            if actual_vat is None:
                continue

            base_amount = None
            expected_vat = None
            is_valid = True

            if breakdown.base_amount:
                base_amount = self.amount_parser.parse(breakdown.base_amount)
                if base_amount is not None:
                    # breakdown.rate is a percentage, hence the division by 100.
                    expected_vat = base_amount * (breakdown.rate / 100)
                    is_valid = self._within_tolerance(
                        expected_vat, actual_vat, self.tolerance
                    )

            results.append(
                MathCheckResult(
                    rate=breakdown.rate,
                    base_amount=base_amount,
                    expected_vat=expected_vat,
                    actual_vat=actual_vat,
                    is_valid=is_valid,
                    tolerance=self.tolerance,
                )
            )

        return results

    def _check_totals(self, vat_summary: VATSummary) -> bool:
        """Check if total_excl + total_vat = total_incl.

        Returns True whenever the check cannot be performed (a total is
        missing or unparseable), so that missing data is not reported as a
        mismatch.
        """
        if not vat_summary.total_excl_vat or not vat_summary.total_incl_vat:
            return True  # Can't verify without both values

        excl = self.amount_parser.parse(vat_summary.total_excl_vat)
        incl = self.amount_parser.parse(vat_summary.total_incl_vat)
        if excl is None or incl is None:
            return True  # Can't verify

        if vat_summary.total_vat:
            vat = self.amount_parser.parse(vat_summary.total_vat)
            if vat is None:
                return True  # Can't verify if vat parsing failed
        else:
            # No explicit VAT total: sum up the breakdown VAT amounts instead.
            # Unparseable breakdown amounts contribute 0 to the sum.
            vat = math.fsum(
                self.amount_parser.parse(breakdown.vat_amount) or 0.0
                for breakdown in vat_summary.breakdowns
            )

        return self._within_tolerance(excl + vat, incl, self.tolerance)

    def _check_line_items(
        self, vat_summary: VATSummary, line_items: LineItemsResult | None
    ) -> bool | None:
        """Check if line items total matches VAT summary.

        Returns None when no comparison is possible (no line items, or no
        parseable ``total_excl_vat``); otherwise whether the sum of the line
        item amounts is within ``line_items_tolerance`` of the summary total.
        """
        if line_items is None or not line_items.items:
            return None  # No comparison possible

        # Sum line item amounts, ignoring items without a parseable amount.
        parsed_amounts = (
            self.amount_parser.parse(item.amount)
            for item in line_items.items
            if item.amount
        )
        line_total = math.fsum(
            amount for amount in parsed_amounts if amount is not None
        )

        # Compare with VAT summary total
        if not vat_summary.total_excl_vat:
            return None
        summary_total = self.amount_parser.parse(vat_summary.total_excl_vat)
        if summary_total is None:
            return None

        # Larger tolerance than for the math checks (rounding across items).
        return self._within_tolerance(
            line_total, summary_total, self.line_items_tolerance
        )

    def _check_amount_consistency(
        self, vat_summary: VATSummary, existing_amount: str | None
    ) -> bool | None:
        """Check if VAT total matches existing amount field.

        Returns None when either side is missing or unparseable; otherwise
        whether ``total_incl_vat`` is within ``tolerance`` of the existing
        amount.
        """
        if existing_amount is None:
            return None  # No comparison possible

        existing = self.amount_parser.parse(existing_amount)
        if existing is None:
            return None

        if not vat_summary.total_incl_vat:
            return None
        vat_total = self.amount_parser.parse(vat_summary.total_incl_vat)
        if vat_total is None:
            return None

        return self._within_tolerance(existing, vat_total, self.tolerance)

    def _calculate_confidence(
        self,
        vat_summary: VATSummary,
        math_checks: list[MathCheckResult],
        total_check: bool,
        line_items_check: bool | None,
        amount_check: bool | None,
    ) -> float:
        """Calculate overall confidence score.

        Starts from the extraction confidence of ``vat_summary`` and scales
        it by the outcome of each check. Returns the score rounded to two
        decimals.
        """
        score = vat_summary.confidence  # Start with extraction confidence

        # Math checks scale the score between 50% (all failed) and 100%.
        if math_checks:
            valid_count = sum(1 for check in math_checks if check.is_valid)
            math_valid_ratio = valid_count / len(math_checks)
            score = score * (0.5 + 0.5 * math_valid_ratio)

        if not total_check:
            score *= 0.5

        # Tri-state checks: True boosts (capped at 1.0), False penalises,
        # None (not checkable) leaves the score untouched.
        if line_items_check is True:
            score = min(score * 1.1, 1.0)  # Boost if line items match
        elif line_items_check is False:
            score *= 0.7

        if amount_check is True:
            score = min(score * 1.1, 1.0)  # Boost if amount matches
        elif amount_check is False:
            score *= 0.6

        return round(score, 2)