Update paddle, and support invoice line item
This commit is contained in:
350
packages/backend/backend/vat/vat_extractor.py
Normal file
350
packages/backend/backend/vat/vat_extractor.py
Normal file
@@ -0,0 +1,350 @@
|
||||
"""
|
||||
VAT Extractor
|
||||
|
||||
Extracts VAT (Moms) information from Swedish invoice text using regex patterns.
|
||||
Supports multiple VAT rates (25%, 12%, 6%, 0%) and various Swedish formats.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
from decimal import Decimal, InvalidOperation
|
||||
|
||||
|
||||
@dataclass
|
||||
class VATBreakdown:
|
||||
"""Single VAT rate breakdown."""
|
||||
|
||||
rate: float # 25.0, 12.0, 6.0, 0.0
|
||||
base_amount: str | None # Tax base (excl VAT)
|
||||
vat_amount: str # VAT amount
|
||||
source: str # 'regex' | 'line_items'
|
||||
|
||||
|
||||
@dataclass
class VATSummary:
    """Aggregated VAT information for one invoice.

    Attributes:
        breakdowns: One entry per VAT rate.
        total_excl_vat: Total excluding VAT, or None when missing.
        total_vat: Total VAT amount, or None when missing.
        total_incl_vat: Total including VAT, or None when missing.
        confidence: Confidence score for the summary.
    """

    breakdowns: list[VATBreakdown]
    total_excl_vat: str | None
    total_vat: str | None
    total_incl_vat: str | None
    confidence: float
|
||||
|
||||
|
||||
class AmountParser:
|
||||
"""Parse Swedish and European number formats."""
|
||||
|
||||
# Patterns to clean amount strings
|
||||
CURRENCY_PATTERN = re.compile(r"(SEK|kr|:-)\s*", re.IGNORECASE)
|
||||
|
||||
def parse(self, amount_str: str) -> float | None:
|
||||
"""
|
||||
Parse amount string to float.
|
||||
|
||||
Handles:
|
||||
- Swedish: 1 234,56
|
||||
- European: 1.234,56
|
||||
- US: 1,234.56
|
||||
|
||||
Args:
|
||||
amount_str: Amount string to parse.
|
||||
|
||||
Returns:
|
||||
Parsed float value or None if invalid.
|
||||
"""
|
||||
if not amount_str or not amount_str.strip():
|
||||
return None
|
||||
|
||||
# Clean the string
|
||||
cleaned = amount_str.strip()
|
||||
|
||||
# Remove currency
|
||||
cleaned = self.CURRENCY_PATTERN.sub("", cleaned).strip()
|
||||
cleaned = re.sub(r"^SEK\s*", "", cleaned, flags=re.IGNORECASE)
|
||||
|
||||
if not cleaned:
|
||||
return None
|
||||
|
||||
# Check for negative
|
||||
is_negative = cleaned.startswith("-")
|
||||
if is_negative:
|
||||
cleaned = cleaned[1:].strip()
|
||||
|
||||
try:
|
||||
# Remove spaces (Swedish thousands separator)
|
||||
cleaned = cleaned.replace(" ", "")
|
||||
|
||||
# Detect format
|
||||
# Swedish/European: comma is decimal separator
|
||||
# US: period is decimal separator
|
||||
has_comma = "," in cleaned
|
||||
has_period = "." in cleaned
|
||||
|
||||
if has_comma and has_period:
|
||||
# Both present - check position
|
||||
comma_pos = cleaned.rfind(",")
|
||||
period_pos = cleaned.rfind(".")
|
||||
|
||||
if comma_pos > period_pos:
|
||||
# European: 1.234,56
|
||||
cleaned = cleaned.replace(".", "")
|
||||
cleaned = cleaned.replace(",", ".")
|
||||
else:
|
||||
# US: 1,234.56
|
||||
cleaned = cleaned.replace(",", "")
|
||||
elif has_comma:
|
||||
# Swedish: 1234,56
|
||||
cleaned = cleaned.replace(",", ".")
|
||||
# else: US format or integer
|
||||
|
||||
value = float(cleaned)
|
||||
return -value if is_negative else value
|
||||
|
||||
except (ValueError, InvalidOperation):
|
||||
return None
|
||||
|
||||
|
||||
class VATExtractor:
    """Extract VAT information from invoice text.

    Per-rate VAT lines ("Moms 25%: 2 500,00") and invoice totals are located
    with regular expressions. Amounts are returned as the cleaned text found
    in the invoice, not as numbers.
    """

    # VAT extraction patterns
    # Note: the amount group uses a literal space instead of \s, so a captured
    # amount cannot cross a line boundary.
    VAT_PATTERNS = [
        # Moms 25%: 2 500,00 or Moms 25% 2 500,00
        re.compile(
            r"[Mm]oms\s*(\d+(?:[,\.]\d+)?)\s*%\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # Varav moms 25% 2 500,00
        re.compile(
            r"[Vv]arav\s+moms\s+(\d+(?:[,\.]\d+)?)\s*%\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # 25% moms: 2 500,00 (at line start or after whitespace)
        re.compile(
            r"(?:^|\s)(\d+(?:[,\.]\d+)?)\s*%\s*moms\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
        # Moms (25%): 2 500,00
        re.compile(
            r"[Mm]oms\s*\((\d+(?:[,\.]\d+)?)\s*%\)\s*:?\s*([\d ,\.]+?)(?:\s*$|\s+[a-zA-Z])",
            re.MULTILINE,
        ),
    ]

    # Pattern with base amount (Underlag)
    # NOTE(review): with DOTALL the lazy ".*?" can reach an "Underlag" on a
    # later line, including one that belongs to a different VAT rate -
    # confirm this is intended.
    VAT_WITH_BASE_PATTERN = re.compile(
        r"[Mm]oms\s*(\d+(?:[,\.]\d+)?)\s*%\s*:?\s*([\d\s,\.]+)"
        r"(?:.*?[Uu]nderlag\s*([\d\s,\.]+))?",
        re.MULTILINE | re.DOTALL,
    )

    # Total patterns
    # These three are not used by the extraction methods of this class; they
    # are kept unchanged as public attributes.
    TOTAL_EXCL_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?|[Nn]etto)\s*(?:exkl\.?\s*)?(?:moms)?\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )
    TOTAL_VAT_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?)\s*moms\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )
    TOTAL_INCL_PATTERN = re.compile(
        r"(?:[Ss]umma|[Tt]otal(?:t)?|[Bb]rutto)\s*(?:inkl\.?\s*)?(?:moms|att\s*betala)?\s*:?\s*([\d\s,\.]+)",
        re.MULTILINE,
    )

    # Amount group used by the total extractors: digits, separators and
    # horizontal whitespace only, so the capture stops at the end of the line
    # instead of swallowing digits that start the next one.
    _LINE_AMOUNT = r"((?:[\d,\.]|[^\S\r\n])+)"

    # Label patterns tried in order by the total extractors (compiled once)
    _TOTAL_EXCL_PATTERNS = (
        re.compile(r"[Ss]umma\s+exkl\.?\s*moms\s*:?\s*" + _LINE_AMOUNT),
        re.compile(r"[Nn]etto\s*:?\s*" + _LINE_AMOUNT),
        re.compile(r"[Ee]xkl\.?\s*moms\s*:?\s*" + _LINE_AMOUNT),
    )
    _TOTAL_VAT_PATTERNS = (
        re.compile(r"[Ss]umma\s+moms\s*:?\s*" + _LINE_AMOUNT),
        re.compile(r"[Tt]otal(?:t)?\s+moms\s*:?\s*" + _LINE_AMOUNT),
        # Generic "Moms:" without percentage. The lookahead rejects
        # "Moms 25%: ...", where the number after the label is the rate and
        # not the VAT total.
        re.compile(
            r"^[Mm]oms\s*:?\s*(?!\s*\d+(?:[,\.]\d+)?\s*%)" + _LINE_AMOUNT,
            re.MULTILINE,
        ),
    )
    _TOTAL_INCL_PATTERNS = (
        re.compile(r"[Ss]umma\s+inkl\.?\s*moms\s*:?\s*" + _LINE_AMOUNT),
        re.compile(r"[Tt]otal(?:t)?\s+att\s+betala\s*:?\s*" + _LINE_AMOUNT),
        re.compile(r"[Bb]rutto\s*:?\s*" + _LINE_AMOUNT),
        re.compile(r"[Aa]tt\s+betala\s*:?\s*" + _LINE_AMOUNT),
    )

    def __init__(self) -> None:
        """Create an extractor with its own AmountParser."""
        self.amount_parser = AmountParser()

    def extract(self, text: str) -> VATSummary:
        """
        Extract VAT information from text.

        Args:
            text: Invoice text (OCR output).

        Returns:
            VATSummary with extracted information. Empty or whitespace-only
            text yields an empty summary with confidence 0.0.
        """
        if not text or not text.strip():
            return VATSummary(
                breakdowns=[],
                total_excl_vat=None,
                total_vat=None,
                total_incl_vat=None,
                confidence=0.0,
            )

        breakdowns = self._extract_breakdowns(text)
        total_excl = self._extract_total_excl(text)
        total_vat = self._extract_total_vat(text)
        total_incl = self._extract_total_incl(text)

        return VATSummary(
            breakdowns=breakdowns,
            total_excl_vat=total_excl,
            total_vat=total_vat,
            total_incl_vat=total_incl,
            confidence=self._calculate_confidence(
                breakdowns, total_excl, total_vat, total_incl
            ),
        )

    def _extract_breakdowns(self, text: str) -> list[VATBreakdown]:
        """Extract individual VAT rate breakdowns.

        The pattern that can also capture a base amount is tried first, then
        each entry of VAT_PATTERNS in order. Only the first valid match per
        rate is kept.
        """
        breakdowns: list[VATBreakdown] = []
        seen_rates: set[float] = set()

        # (pattern, whether group 3 may hold a base amount)
        candidates = [(self.VAT_WITH_BASE_PATTERN, True)]
        candidates.extend((pattern, False) for pattern in self.VAT_PATTERNS)

        for pattern, has_base_group in candidates:
            for match in pattern.finditer(text):
                rate = self._parse_rate(match.group(1))
                vat_amount = self._clean_amount(match.group(2))
                if rate is None or not vat_amount or rate in seen_rates:
                    continue

                base_amount = None
                if has_base_group and match.group(3):
                    base_amount = self._clean_amount(match.group(3))

                seen_rates.add(rate)
                breakdowns.append(
                    VATBreakdown(
                        rate=rate,
                        base_amount=base_amount,
                        vat_amount=vat_amount,
                        source="regex",
                    )
                )

        return breakdowns

    def _first_valid_amount(
        self, patterns: tuple[re.Pattern[str], ...], text: str
    ) -> str | None:
        """Return the first capture, in pattern order, that is a valid amount.

        Matches whose captured text does not parse as an amount are skipped,
        so one unusable hit does not hide a usable one later in the text.
        """
        for pattern in patterns:
            for match in pattern.finditer(text):
                amount = self._clean_amount(match.group(1))
                if amount is not None:
                    return amount
        return None

    def _extract_total_excl(self, text: str) -> str | None:
        """Extract total excluding VAT."""
        return self._first_valid_amount(self._TOTAL_EXCL_PATTERNS, text)

    def _extract_total_vat(self, text: str) -> str | None:
        """Extract total VAT amount."""
        return self._first_valid_amount(self._TOTAL_VAT_PATTERNS, text)

    def _extract_total_incl(self, text: str) -> str | None:
        """Extract total including VAT."""
        return self._first_valid_amount(self._TOTAL_INCL_PATTERNS, text)

    def _parse_rate(self, rate_str: str) -> float | None:
        """Parse VAT rate string ("25" or "12,5") to float, or None if invalid."""
        try:
            return float(rate_str.replace(",", "."))
        except (ValueError, TypeError, AttributeError):
            # AttributeError covers a non-string value such as None
            return None

    def _clean_amount(self, amount_str: str) -> str | None:
        """Clean and validate amount string.

        Returns:
            The stripped amount text, or None when nothing is left or the
            text does not parse as a number.
        """
        if not amount_str:
            return None

        # Remove trailing non-numeric chars (except comma/period)
        cleaned = re.sub(r"[^\d\s,\.]+$", "", amount_str.strip()).strip()
        if not cleaned:
            return None

        # Validate it parses as a number
        if self.amount_parser.parse(cleaned) is None:
            return None

        return cleaned

    def _calculate_confidence(
        self,
        breakdowns: list[VATBreakdown],
        total_excl: str | None,
        total_vat: str | None,
        total_incl: str | None,
    ) -> float:
        """Calculate confidence score based on extracted data.

        Scoring: 0.3 for any breakdown, 0.2 each for the total excluding VAT
        and the total VAT, 0.15 for the total including VAT, and another 0.15
        when excl + VAT matches incl within 0.02.

        Returns:
            Score capped at 1.0.
        """
        score = 0.0

        # Has VAT breakdowns
        if breakdowns:
            score += 0.3

        # Has total excluding VAT
        if total_excl:
            score += 0.2

        # Has total VAT
        if total_vat:
            score += 0.2

        # Has total including VAT
        if total_incl:
            score += 0.15

        # Mathematical consistency check
        if total_excl and total_vat and total_incl:
            excl = self.amount_parser.parse(total_excl)
            vat = self.amount_parser.parse(total_vat)
            incl = self.amount_parser.parse(total_incl)

            # Compare against None, not truthiness: 0.0 is a legitimate
            # amount (e.g. the VAT total of a 0 % invoice).
            if excl is not None and vat is not None and incl is not None:
                if abs(excl + vat - incl) < 0.02:  # Allow 2 cent tolerance
                    score += 0.15

        return min(score, 1.0)
|
||||
Reference in New Issue
Block a user