"""
VAT Extractor

Extracts VAT (Moms) information from Swedish invoice text using regex patterns.
Supports multiple VAT rates (25%, 12%, 6%, 0%) and various Swedish formats.
"""
|
|
|
|
from dataclasses import dataclass
|
|
import re
|
|
from decimal import Decimal, InvalidOperation
|
|
|
|
|
|
@dataclass
|
|
class VATBreakdown:
|
|
"""Single VAT rate breakdown."""
|
|
|
|
rate: float # 25.0, 12.0, 6.0, 0.0
|
|
base_amount: str | None # Tax base (excl VAT)
|
|
vat_amount: str # VAT amount
|
|
source: str # 'regex' | 'line_items'
|
|
|
|
|
|
@dataclass
class VATSummary:
    """Aggregated VAT result for one invoice.

    Attributes:
        breakdowns: Per-rate VAT lines that were found.
        total_excl_vat: Total excluding VAT as text, or None if not found.
        total_vat: Total VAT as text, or None if not found.
        total_incl_vat: Total including VAT as text, or None if not found.
        confidence: Score describing how complete/consistent the result is.
    """

    # Individual VAT rate lines.
    breakdowns: list[VATBreakdown]
    # Invoice totals, each optional and kept in textual form.
    total_excl_vat: str | None
    total_vat: str | None
    total_incl_vat: str | None
    # Confidence score attached to the extraction.
    confidence: float
|
|
|
|
|
|
class AmountParser:
|
|
"""Parse Swedish and European number formats."""
|
|
|
|
# Patterns to clean amount strings
|
|
CURRENCY_PATTERN = re.compile(r"(SEK|kr|:-)\s*", re.IGNORECASE)
|
|
|
|
def parse(self, amount_str: str) -> float | None:
|
|
"""
|
|
Parse amount string to float.
|
|
|
|
Handles:
|
|
- Swedish: 1 234,56
|
|
- European: 1.234,56
|
|
- US: 1,234.56
|
|
|
|
Args:
|
|
amount_str: Amount string to parse.
|
|
|
|
Returns:
|
|
Parsed float value or None if invalid.
|
|
"""
|
|
if not amount_str or not amount_str.strip():
|
|
return None
|
|
|
|
# Clean the string
|
|
cleaned = amount_str.strip()
|
|
|
|
# Remove currency
|
|
cleaned = self.CURRENCY_PATTERN.sub("", cleaned).strip()
|
|
cleaned = re.sub(r"^SEK\s*", "", cleaned, flags=re.IGNORECASE)
|
|
|
|
if not cleaned:
|
|
return None
|
|
|
|
# Check for negative
|
|
is_negative = cleaned.startswith("-")
|
|
if is_negative:
|
|
cleaned = cleaned[1:].strip()
|
|
|
|
try:
|
|
# Remove spaces (Swedish thousands separator)
|
|
cleaned = cleaned.replace(" ", "")
|
|
|
|
# Detect format
|
|
# Swedish/European: comma is decimal separator
|
|
# US: period is decimal separator
|
|
has_comma = "," in cleaned
|
|
has_period = "." in cleaned
|
|
|
|
if has_comma and has_period:
|
|
# Both present - check position
|
|
comma_pos = cleaned.rfind(",")
|
|
period_pos = cleaned.rfind(".")
|
|
|
|
if comma_pos > period_pos:
|
|
# European: 1.234,56
|
|
cleaned = cleaned.replace(".", "")
|
|
cleaned = cleaned.replace(",", ".")
|
|
else:
|
|
# US: 1,234.56
|
|
cleaned = cleaned.replace(",", "")
|
|
elif has_comma:
|
|
# Swedish: 1234,56
|
|
cleaned = cleaned.replace(",", ".")
|
|
# else: US format or integer
|
|
|
|
value = float(cleaned)
|
|
return -value if is_negative else value
|
|
|
|
except (ValueError, InvalidOperation):
|
|
return None
|
|
|
|
|
|
class VATExtractor:
    """Extract VAT information from invoice text.

    The extractor looks for per-rate VAT lines ("Moms 25%: 2 500,00") and for
    the three invoice totals (excluding VAT, VAT, including VAT), and rates
    the result with a confidence score.
    """

    # Characters that may form an amount: digits, comma, period and
    # *horizontal* whitespace only (space, tab, non-breaking space, narrow
    # no-break space).  Leaving line breaks out keeps a capture from running
    # on into the following line.
    _AMOUNT = r"[\d,.\t \xa0\u202f]+"

    # Placed directly after the word "moms": refuses a following percentage
    # such as "25%" or ": 12,5 %", so a VAT *rate* is never taken for the
    # VAT *total*.  It sits at a fixed position on purpose; after the
    # optional whitespace the regex engine could backtrack around it.
    _NOT_A_RATE = r"(?!\s*:?\s*\d+(?:[,.]\d+)?\s*%)"

    # VAT extraction patterns
    # Note: the amount group only allows digits, plain spaces, commas and
    # periods, so it cannot cross a line boundary.
    VAT_PATTERNS = [
        # Moms 25%: 2 500,00 or Moms 25% 2 500,00
        re.compile(
            r"[Mm]oms\s*(\d+(?:[,\.]\d+)?)\s*%\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # Varav moms 25% 2 500,00
        re.compile(
            r"[Vv]arav\s+moms\s+(\d+(?:[,\.]\d+)?)\s*%\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # 25% moms: 2 500,00 (at line start or after whitespace)
        re.compile(
            r"(?:^|\s)(\d+(?:[,\.]\d+)?)\s*%\s*moms\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # Moms (25%): 2 500,00
        re.compile(
            r"[Mm]oms\s*\((\d+(?:[,\.]\d+)?)\s*%\)\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
    ]

    # Pattern with base amount (Underlag).  The optional "Underlag" part must
    # be on the same line as the VAT amount; searching the rest of the
    # document would attach a base amount belonging to a different rate.
    VAT_WITH_BASE_PATTERN = re.compile(
        r"[Mm]oms\s*(\d+(?:[,\.]\d+)?)\s*%\s*:?\s*("
        + _AMOUNT
        + r")(?:[^\r\n]*?[Uu]nderlag[^\S\r\n]*:?[^\S\r\n]*("
        + _AMOUNT
        + r"))?"
    )

    # Total patterns.  These three are not referenced by the methods of this
    # class; they are kept unchanged for code that may use them directly.
    TOTAL_EXCL_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?|[Nn]etto)\s*(?:exkl\.?\s*)?(?:moms)?\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )
    TOTAL_VAT_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?)\s*moms\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )
    TOTAL_INCL_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?|[Bb]rutto)\s*(?:inkl\.?\s*)?(?:moms|att\s*betala)?\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )

    # Patterns actually used for the totals, most specific first.  Compiled
    # once here instead of on every call.
    _TOTAL_EXCL_PATTERNS = (
        re.compile(r"[Ss]umma\s+exkl\.?\s*moms\s*:?\s*(" + _AMOUNT + r")"),
        re.compile(r"[Nn]etto\s*:?\s*(" + _AMOUNT + r")"),
        re.compile(r"[Ee]xkl\.?\s*moms\s*:?\s*(" + _AMOUNT + r")"),
    )
    _TOTAL_VAT_PATTERNS = (
        re.compile(r"[Ss]umma\s+moms" + _NOT_A_RATE + r"\s*:?\s*(" + _AMOUNT + r")"),
        re.compile(
            r"[Tt]otal(?:t)?\s+moms" + _NOT_A_RATE + r"\s*:?\s*(" + _AMOUNT + r")"
        ),
        # Generic "Moms:" without percentage
        re.compile(
            r"^[Mm]oms" + _NOT_A_RATE + r"\s*:?\s*(" + _AMOUNT + r")",
            re.MULTILINE,
        ),
    )
    _TOTAL_INCL_PATTERNS = (
        re.compile(r"[Ss]umma\s+inkl\.?\s*moms\s*:?\s*(" + _AMOUNT + r")"),
        re.compile(r"[Tt]otal(?:t)?\s+att\s+betala\s*:?\s*(" + _AMOUNT + r")"),
        re.compile(r"[Bb]rutto\s*:?\s*(" + _AMOUNT + r")"),
        re.compile(r"[Aa]tt\s+betala\s*:?\s*(" + _AMOUNT + r")"),
    )

    # Helpers for _clean_amount.
    _TRAILING_JUNK = re.compile(r"[^\d\s,\.]+$")
    _HORIZONTAL_SPACE = re.compile(r"[^\S\r\n]+")

    def __init__(self):
        self.amount_parser = AmountParser()

    def extract(self, text: str) -> VATSummary:
        """
        Extract VAT information from text.

        Args:
            text: Invoice text (OCR output).

        Returns:
            VATSummary with extracted information. For empty or
            whitespace-only text the summary has no breakdowns, no totals
            and a confidence of 0.0.
        """
        if not text or not text.strip():
            return VATSummary(
                breakdowns=[],
                total_excl_vat=None,
                total_vat=None,
                total_incl_vat=None,
                confidence=0.0,
            )

        breakdowns = self._extract_breakdowns(text)
        total_excl = self._extract_total_excl(text)
        total_vat = self._extract_total_vat(text)
        total_incl = self._extract_total_incl(text)

        return VATSummary(
            breakdowns=breakdowns,
            total_excl_vat=total_excl,
            total_vat=total_vat,
            total_incl_vat=total_incl,
            confidence=self._calculate_confidence(
                breakdowns, total_excl, total_vat, total_incl
            ),
        )

    def _extract_breakdowns(self, text: str) -> list[VATBreakdown]:
        """Extract individual VAT rate breakdowns.

        The pattern that can also capture a base amount is tried first, then
        the plain patterns. Only the first valid hit per rate is kept.
        """
        breakdowns: list[VATBreakdown] = []
        seen_rates: set[float] = set()

        # (pattern, whether group 3 may hold a base amount)
        candidates = [(self.VAT_WITH_BASE_PATTERN, True)]
        candidates.extend((pattern, False) for pattern in self.VAT_PATTERNS)

        for pattern, has_base_group in candidates:
            for match in pattern.finditer(text):
                rate = self._parse_rate(match.group(1))
                vat_amount = self._clean_amount(match.group(2))
                if rate is None or not vat_amount or rate in seen_rates:
                    continue

                base_amount = None
                if has_base_group and match.group(3):
                    base_amount = self._clean_amount(match.group(3))

                seen_rates.add(rate)
                breakdowns.append(
                    VATBreakdown(
                        rate=rate,
                        base_amount=base_amount,
                        vat_amount=vat_amount,
                        source="regex",
                    )
                )

        return breakdowns

    def _first_valid_amount(self, patterns, text: str) -> str | None:
        """Return the first capture of *patterns* in *text* that is a valid amount.

        Patterns are tried in order and, within a pattern, matches in text
        order. A match whose capture does not validate is skipped rather
        than ending the search.
        """
        for pattern in patterns:
            for match in pattern.finditer(text):
                amount = self._clean_amount(match.group(1))
                if amount is not None:
                    return amount
        return None

    def _extract_total_excl(self, text: str) -> str | None:
        """Extract total excluding VAT."""
        return self._first_valid_amount(self._TOTAL_EXCL_PATTERNS, text)

    def _extract_total_vat(self, text: str) -> str | None:
        """Extract total VAT amount.

        A "moms" label followed by a percentage ("Moms 25%: ...") is a
        per-rate line, not a total, and is ignored here.
        """
        return self._first_valid_amount(self._TOTAL_VAT_PATTERNS, text)

    def _extract_total_incl(self, text: str) -> str | None:
        """Extract total including VAT."""
        return self._first_valid_amount(self._TOTAL_INCL_PATTERNS, text)

    def _parse_rate(self, rate_str: str) -> float | None:
        """Parse VAT rate string to float; a decimal comma is accepted.

        Returns None when the value cannot be converted (including None or
        other non-string input).
        """
        try:
            return float(rate_str.replace(",", "."))
        except (ValueError, TypeError, AttributeError):
            return None

    def _clean_amount(self, amount_str: str | None) -> str | None:
        """Clean and validate amount string.

        Returns the trimmed amount text, with every run of horizontal
        whitespace replaced by one plain space, or None when the text is
        empty or is not accepted by the amount parser.
        """
        if not amount_str:
            return None

        # Remove trailing non-numeric chars (except comma/period)
        cleaned = self._TRAILING_JUNK.sub("", amount_str.strip()).strip()
        if not cleaned:
            return None

        # Non-breaking spaces and tabs are valid thousands separators in the
        # source text; hand the parser (and the caller) plain spaces.
        cleaned = self._HORIZONTAL_SPACE.sub(" ", cleaned)

        # Validate it parses as a number
        if self.amount_parser.parse(cleaned) is None:
            return None

        return cleaned

    def _calculate_confidence(
        self,
        breakdowns: list[VATBreakdown],
        total_excl: str | None,
        total_vat: str | None,
        total_incl: str | None,
    ) -> float:
        """Calculate confidence score based on extracted data.

        Weights: breakdowns 0.3, total excl. VAT 0.2, total VAT 0.2, total
        incl. VAT 0.15, plus 0.15 when excl + VAT matches incl within 0.02.
        The result is capped at 1.0.
        """
        score = 0.0

        # Has VAT breakdowns
        if breakdowns:
            score += 0.3

        # Has total excluding VAT
        if total_excl:
            score += 0.2

        # Has total VAT
        if total_vat:
            score += 0.2

        # Has total including VAT
        if total_incl:
            score += 0.15

        # Mathematical consistency check
        if total_excl and total_vat and total_incl:
            parsed = [
                self.amount_parser.parse(value)
                for value in (total_excl, total_vat, total_incl)
            ]
            # Compare against None explicitly: a parsed 0.0 (e.g. VAT total
            # "0,00") is a valid amount and must not skip the check.
            if all(value is not None for value in parsed):
                excl, vat, incl = parsed
                if abs(excl + vat - incl) < 0.02:  # Allow 2 cent tolerance
                    score += 0.15

        return min(score, 1.0)
|