# invoice-master-poc-v2/packages/backend/backend/domain/invoice_validator.py

"""
Invoice Validator

Business logic for validating extracted invoice fields.
Checks for required fields, payment references, and confidence thresholds.
"""
from __future__ import annotations

from dataclasses import dataclass

from backend.domain.utils import has_value


@dataclass(frozen=True)
class ValidationIssue:
    """
    One problem detected while validating an invoice.

    Instances are frozen: attributes cannot be reassigned after creation.

    Attributes:
        field: Field the problem relates to
        severity: Seriousness of the problem: "error", "warning" or "info"
        message: Description of the problem, written for human readers
    """

    field: str  # name of the affected field
    severity: str  # "error", "warning" or "info"
    message: str  # human-readable explanation


@dataclass(frozen=True)
class ValidationResult:
    """
    Frozen outcome of one invoice validation run.

    As populated by InvoiceValidator.validate:

    Attributes:
        is_valid: False when any issue has severity "error"; warnings
                  alone leave it True
        issues: Every issue that was recorded, as a tuple
        confidence: Mean of the supplied confidence scores (0.0 when no
                    scores were supplied)
    """

    is_valid: bool  # no "error" issues were recorded
    issues: tuple[ValidationIssue, ...]  # all recorded issues
    confidence: float  # mean confidence score


class InvoiceValidator:
    """
    Checks extracted invoice fields for completeness and extraction quality.

    Rules, applied in this order:
    1. Every field in REQUIRED_FIELDS must have a value (error otherwise)
    2. At least one field in PAYMENT_REF_FIELDS should have a value
       (warning otherwise)
    3. Every supplied confidence score should reach the configured
       minimum (warning otherwise)

    Only errors make an invoice invalid; warnings are reported but tolerated.

    Required fields:
    - Amount

    Payment reference fields (one is enough):
    - OCR
    - Bankgiro
    - Plusgiro
    - payment_line
    """

    REQUIRED_FIELDS: tuple[str, ...] = ("Amount",)
    PAYMENT_REF_FIELDS: tuple[str, ...] = (
        "OCR",
        "Bankgiro",
        "Plusgiro",
        "payment_line",
    )
    DEFAULT_MIN_CONFIDENCE: float = 0.5

    def __init__(self, min_confidence: float = DEFAULT_MIN_CONFIDENCE) -> None:
        """
        Create a validator with the given confidence threshold.

        Args:
            min_confidence: Lowest acceptable confidence score. Any score
                           strictly below it yields a warning.
        """
        self._min_confidence = min_confidence

    def validate(
        self,
        fields: dict[str, str | None],
        confidence: dict[str, float],
    ) -> ValidationResult:
        """
        Run all validation rules against one set of extracted fields.

        Issues are reported in rule order: missing required fields first,
        then the payment reference warning, then low-confidence warnings
        in the iteration order of ``confidence``.

        Args:
            fields: Extracted values keyed by field name
            confidence: Confidence scores keyed by field name

        Returns:
            ValidationResult holding the validity flag, the issues found
            and the mean of all supplied confidence scores
        """
        threshold = self._min_confidence

        # Rule 1: each required field must carry a value (per has_value).
        found: list[ValidationIssue] = [
            ValidationIssue(
                field=name,
                severity="error",
                message=f"Required field '{name}' is missing",
            )
            for name in self.REQUIRED_FIELDS
            if not has_value(fields.get(name))
        ]

        # Rule 2: stop at the first payment reference that carries a value;
        # the else branch runs only when none of them did.
        for name in self.PAYMENT_REF_FIELDS:
            if has_value(fields.get(name)):
                break
        else:
            found.append(
                ValidationIssue(
                    field="payment_reference",
                    severity="warning",
                    message="No payment reference (OCR, Bankgiro, Plusgiro, or payment_line)",
                )
            )

        # Rule 3: flag every score strictly below the threshold.
        found.extend(
            ValidationIssue(
                field=name,
                severity="warning",
                message=f"Low confidence ({score:.2f}) for field '{name}'",
            )
            for name, score in confidence.items()
            if score < threshold
        )

        # Mean over every supplied score; 0.0 avoids dividing by zero when
        # no scores were given.
        if confidence:
            mean_confidence = sum(confidence.values()) / len(confidence)
        else:
            mean_confidence = 0.0

        # Valid means no issue was recorded with severity "error".
        error_free = all(issue.severity != "error" for issue in found)

        return ValidationResult(
            is_valid=error_free,
            issues=tuple(found),
            confidence=mean_confidence,
        )