Files
invoice-master-poc-v2/packages/backend/backend/vat/vat_extractor.py
2026-02-03 21:28:06 +01:00

351 lines
11 KiB
Python

"""
VAT Extractor
Extracts VAT (Moms) information from Swedish invoice text using regex patterns.
Supports multiple VAT rates (25%, 12%, 6%, 0%) and various Swedish formats.
"""
from dataclasses import dataclass
import re
from decimal import Decimal, InvalidOperation
@dataclass
class VATBreakdown:
"""Single VAT rate breakdown."""
rate: float # 25.0, 12.0, 6.0, 0.0
base_amount: str | None # Tax base (excl VAT)
vat_amount: str # VAT amount
source: str # 'regex' | 'line_items'
@dataclass
class VATSummary:
    """Result of a VAT extraction run over one invoice text.

    Attributes:
        breakdowns: One entry per VAT rate that was found.
        total_excl_vat: Total excluding VAT as text, or None if not found.
        total_vat: Total VAT amount as text, or None if not found.
        total_incl_vat: Total including VAT as text, or None if not found.
        confidence: Confidence score attached to the extraction.
    """

    breakdowns: list[VATBreakdown]
    total_excl_vat: str | None
    total_vat: str | None
    total_incl_vat: str | None
    confidence: float
class AmountParser:
"""Parse Swedish and European number formats."""
# Patterns to clean amount strings
CURRENCY_PATTERN = re.compile(r"(SEK|kr|:-)\s*", re.IGNORECASE)
def parse(self, amount_str: str) -> float | None:
"""
Parse amount string to float.
Handles:
- Swedish: 1 234,56
- European: 1.234,56
- US: 1,234.56
Args:
amount_str: Amount string to parse.
Returns:
Parsed float value or None if invalid.
"""
if not amount_str or not amount_str.strip():
return None
# Clean the string
cleaned = amount_str.strip()
# Remove currency
cleaned = self.CURRENCY_PATTERN.sub("", cleaned).strip()
cleaned = re.sub(r"^SEK\s*", "", cleaned, flags=re.IGNORECASE)
if not cleaned:
return None
# Check for negative
is_negative = cleaned.startswith("-")
if is_negative:
cleaned = cleaned[1:].strip()
try:
# Remove spaces (Swedish thousands separator)
cleaned = cleaned.replace(" ", "")
# Detect format
# Swedish/European: comma is decimal separator
# US: period is decimal separator
has_comma = "," in cleaned
has_period = "." in cleaned
if has_comma and has_period:
# Both present - check position
comma_pos = cleaned.rfind(",")
period_pos = cleaned.rfind(".")
if comma_pos > period_pos:
# European: 1.234,56
cleaned = cleaned.replace(".", "")
cleaned = cleaned.replace(",", ".")
else:
# US: 1,234.56
cleaned = cleaned.replace(",", "")
elif has_comma:
# Swedish: 1234,56
cleaned = cleaned.replace(",", ".")
# else: US format or integer
value = float(cleaned)
return -value if is_negative else value
except (ValueError, InvalidOperation):
return None
class VATExtractor:
"""Extract VAT information from invoice text."""
# VAT extraction patterns
# Note: Amount pattern uses [^\n] to avoid crossing line boundaries
VAT_PATTERNS = [
# Moms 25%: 2 500,00 or Moms 25% 2 500,00
re.compile(
r"[Mm]oms\s*(\d+(?:[,\.]\d+)?)\s*%\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
re.MULTILINE,
),
# Varav moms 25% 2 500,00
re.compile(
r"[Vv]arav\s+moms\s+(\d+(?:[,\.]\d+)?)\s*%\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
re.MULTILINE,
),
# 25% moms: 2 500,00 (at line start or after whitespace)
re.compile(
r"(?:^|\s)(\d+(?:[,\.]\d+)?)\s*%\s*moms\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
re.MULTILINE,
),
# Moms (25%): 2 500,00
re.compile(
r"[Mm]oms\s*\((\d+(?:[,\.]\d+)?)\s*%\)\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
re.MULTILINE,
),
]
# Pattern with base amount (Underlag)
VAT_WITH_BASE_PATTERN = re.compile(
r"[Mm]oms\s*(\d+(?:[,\.]\d+)?)\s*%\s*:?\s*([\d\s,\.]+)"
r"(?:.*?[Uu]nderlag\s*([\d\s,\.]+))?",
re.MULTILINE | re.DOTALL,
)
# Total patterns
TOTAL_EXCL_PATTERN = re.compile(
r"(?:[Ss]umma|[Tt]otal(?:t)?|[Nn]etto)\s*(?:exkl\.?\s*)?(?:moms)?\s*:?\s*([\d\s,\.]+)",
re.MULTILINE,
)
TOTAL_VAT_PATTERN = re.compile(
r"(?:[Ss]umma|[Tt]otal(?:t)?)\s*moms\s*:?\s*([\d\s,\.]+)",
re.MULTILINE,
)
TOTAL_INCL_PATTERN = re.compile(
r"(?:[Ss]umma|[Tt]otal(?:t)?|[Bb]rutto)\s*(?:inkl\.?\s*)?(?:moms|att\s*betala)?\s*:?\s*([\d\s,\.]+)",
re.MULTILINE,
)
def __init__(self):
self.amount_parser = AmountParser()
def extract(self, text: str) -> VATSummary:
"""
Extract VAT information from text.
Args:
text: Invoice text (OCR output).
Returns:
VATSummary with extracted information.
"""
if not text or not text.strip():
return VATSummary(
breakdowns=[],
total_excl_vat=None,
total_vat=None,
total_incl_vat=None,
confidence=0.0,
)
breakdowns = self._extract_breakdowns(text)
total_excl = self._extract_total_excl(text)
total_vat = self._extract_total_vat(text)
total_incl = self._extract_total_incl(text)
confidence = self._calculate_confidence(
breakdowns, total_excl, total_vat, total_incl
)
return VATSummary(
breakdowns=breakdowns,
total_excl_vat=total_excl,
total_vat=total_vat,
total_incl_vat=total_incl,
confidence=confidence,
)
def _extract_breakdowns(self, text: str) -> list[VATBreakdown]:
"""Extract individual VAT rate breakdowns."""
breakdowns = []
seen_rates = set()
# Try pattern with base amount first
for match in self.VAT_WITH_BASE_PATTERN.finditer(text):
rate = self._parse_rate(match.group(1))
vat_amount = self._clean_amount(match.group(2))
base_amount = (
self._clean_amount(match.group(3)) if match.group(3) else None
)
if rate is not None and vat_amount and rate not in seen_rates:
seen_rates.add(rate)
breakdowns.append(
VATBreakdown(
rate=rate,
base_amount=base_amount,
vat_amount=vat_amount,
source="regex",
)
)
# Try other patterns
for pattern in self.VAT_PATTERNS:
for match in pattern.finditer(text):
rate = self._parse_rate(match.group(1))
vat_amount = self._clean_amount(match.group(2))
if rate is not None and vat_amount and rate not in seen_rates:
seen_rates.add(rate)
breakdowns.append(
VATBreakdown(
rate=rate,
base_amount=None,
vat_amount=vat_amount,
source="regex",
)
)
return breakdowns
def _extract_total_excl(self, text: str) -> str | None:
"""Extract total excluding VAT."""
# Look for specific patterns first
patterns = [
re.compile(r"[Ss]umma\s+exkl\.?\s*moms\s*:?\s*([\d\s,\.]+)"),
re.compile(r"[Nn]etto\s*:?\s*([\d\s,\.]+)"),
re.compile(r"[Ee]xkl\.?\s*moms\s*:?\s*([\d\s,\.]+)"),
]
for pattern in patterns:
match = pattern.search(text)
if match:
return self._clean_amount(match.group(1))
return None
def _extract_total_vat(self, text: str) -> str | None:
"""Extract total VAT amount."""
patterns = [
re.compile(r"[Ss]umma\s+moms\s*:?\s*([\d\s,\.]+)"),
re.compile(r"[Tt]otal(?:t)?\s+moms\s*:?\s*([\d\s,\.]+)"),
# Generic "Moms:" without percentage
re.compile(r"^[Mm]oms\s*:?\s*([\d\s,\.]+)", re.MULTILINE),
]
for pattern in patterns:
match = pattern.search(text)
if match:
return self._clean_amount(match.group(1))
return None
def _extract_total_incl(self, text: str) -> str | None:
"""Extract total including VAT."""
patterns = [
re.compile(r"[Ss]umma\s+inkl\.?\s*moms\s*:?\s*([\d\s,\.]+)"),
re.compile(r"[Tt]otal(?:t)?\s+att\s+betala\s*:?\s*([\d\s,\.]+)"),
re.compile(r"[Bb]rutto\s*:?\s*([\d\s,\.]+)"),
re.compile(r"[Aa]tt\s+betala\s*:?\s*([\d\s,\.]+)"),
]
for pattern in patterns:
match = pattern.search(text)
if match:
return self._clean_amount(match.group(1))
return None
def _parse_rate(self, rate_str: str) -> float | None:
"""Parse VAT rate string to float."""
try:
rate_str = rate_str.replace(",", ".")
return float(rate_str)
except (ValueError, TypeError):
return None
def _clean_amount(self, amount_str: str) -> str | None:
"""Clean and validate amount string."""
if not amount_str:
return None
cleaned = amount_str.strip()
# Remove trailing non-numeric chars (except comma/period)
cleaned = re.sub(r"[^\d\s,\.]+$", "", cleaned).strip()
if not cleaned:
return None
# Validate it parses as a number
if self.amount_parser.parse(cleaned) is None:
return None
return cleaned
def _calculate_confidence(
self,
breakdowns: list[VATBreakdown],
total_excl: str | None,
total_vat: str | None,
total_incl: str | None,
) -> float:
"""Calculate confidence score based on extracted data."""
score = 0.0
# Has VAT breakdowns
if breakdowns:
score += 0.3
# Has total excluding VAT
if total_excl:
score += 0.2
# Has total VAT
if total_vat:
score += 0.2
# Has total including VAT
if total_incl:
score += 0.15
# Mathematical consistency check
if total_excl and total_vat and total_incl:
excl = self.amount_parser.parse(total_excl)
vat = self.amount_parser.parse(total_vat)
incl = self.amount_parser.parse(total_incl)
if excl and vat and incl:
expected = excl + vat
if abs(expected - incl) < 0.02: # Allow 2 cent tolerance
score += 0.15
return min(score, 1.0)