Files
invoice-master-poc-v2/packages/backend/backend/pipeline/normalizers/amount.py
Yaojia Wang ad5ed46b4c WIP
2026-02-11 23:40:38 +01:00

258 lines
9.6 KiB
Python

"""
Amount Normalizer
Handles normalization and validation of monetary amounts.
"""
import re
from shared.utils.text_cleaner import TextCleaner
from shared.utils.validators import FieldValidators
from shared.utils.ocr_corrections import OCRCorrections
from .base import BaseNormalizer, NormalizationResult
class AmountNormalizer(BaseNormalizer):
    """
    Normalizes monetary amounts from Swedish invoices.

    Handles various Swedish amount formats:
    - With decimal: 1 234,56 kr
    - With SEK suffix: 1234.56 SEK
    - Payment line kronor/ore: 590 00 (space = decimal separator)
    - Multiple amounts (returns the last one, usually the total)

    Successful results carry the amount formatted with two decimals and a
    dot as decimal separator (e.g. "1234.56"). Only strictly positive
    amounts are accepted.
    """

    # Payment line kronor/ore pattern: "590 00" means 590.00 SEK
    # Only matches when no comma/dot is present (pure digit-space-2digit format)
    _KRONOR_ORE_PATTERN = re.compile(r'^(\d+)\s+(\d{2})$')

    # Space-like characters that appear as thousands separators: ASCII space,
    # no-break space, thin space and narrow no-break space. The amount regexes
    # match them through `\s`, so the parser has to remove the same characters.
    # Tabs are deliberately NOT listed: a tab between digits is more likely a
    # column separator than a grouping character.
    _GROUPING_SPACES = str.maketrans("", "", " \u00a0\u2009\u202f")

    # Separate alternatives for European and Anglo formats.
    # (?!\d) lookahead prevents partial matches (e.g. "1,23" in "1,234.56")
    # group(1) European: dot=thousand, comma=decimal (2.254,50 or 1 234,56)
    # group(2) Anglo: comma=thousand, dot=decimal (1,234.56 or 1234.56)
    _AMOUNT_PATTERN = re.compile(
        r"(\d[\d\s.]*,\d{2})(?!\d)\s*(?:kr|SEK)?"
        r"|"
        r"(\d[\d\s,]*\.\d{2})(?!\d)\s*(?:kr|SEK)?",
        re.IGNORECASE,
    )

    # Fallback patterns, tried in this order when nothing above matched.
    _SIMPLE_DECIMAL_PATTERN = re.compile(r"(\d+[,\.]\d{2})")
    _LABELED_INT_PATTERN = re.compile(
        r"(?:amount|belopp|summa|total)[:\s]*(\d+)", re.IGNORECASE
    )
    _STANDALONE_INT_PATTERN = re.compile(r"\b(\d{3,})\b")

    @property
    def field_name(self) -> str:
        """Name of the field this normalizer handles."""
        return "Amount"

    @classmethod
    def _try_kronor_ore(cls, text: str) -> NormalizationResult | None:
        """Try to parse as payment line kronor/ore format.

        Swedish payment lines separate kronor and ore with a space:
        "590 00" = 590.00 SEK, "15658 00" = 15658.00 SEK

        Only applies when text has no comma or dot (otherwise it's
        a normal amount format with explicit decimal separator).

        Returns NormalizationResult on success, None if not matched
        or if the parsed amount is not strictly positive.
        """
        if ',' in text or '.' in text:
            return None

        found = cls._KRONOR_ORE_PATTERN.match(text.strip())
        if found is None:
            return None

        kronor, ore = found.group(1), found.group(2)
        try:
            amount = float(f"{kronor}.{ore}")
        except ValueError:
            return None

        if amount > 0:
            return NormalizationResult.success(f"{amount:.2f}")
        return None

    @staticmethod
    def _parse_amount_str(match: str) -> float | None:
        """Convert matched amount string to float, detecting European vs Anglo format.

        European: 2.254,50 -> 2254.50 (dot=thousand, comma=decimal)
        Anglo: 1,234.56 -> 1234.56 (comma=thousand, dot=decimal)
        Swedish: 1 234,56 -> 1234.56 (space=thousand, comma=decimal)

        Space-like grouping characters (regular, no-break, thin and narrow
        no-break space) are removed before parsing.

        Returns the parsed float, or None if the cleaned string is not a
        valid number.
        """
        compact = match.translate(AmountNormalizer._GROUPING_SPACES)

        last_comma = compact.rfind(",")
        last_dot = compact.rfind(".")

        if last_comma != -1 and last_dot != -1:
            if last_comma > last_dot:
                # European: 2.254,50 -- the rightmost separator is the decimal one
                compact = compact.replace(".", "").replace(",", ".")
            else:
                # Anglo: 1,234.56
                compact = compact.replace(",", "")
        elif last_comma != -1:
            # Comma only: treat it as the decimal separator
            compact = compact.replace(",", ".")

        try:
            return float(compact)
        except ValueError:
            return None

    @staticmethod
    def _positive_float(value: str) -> float | None:
        """Return float(value) when it parses and is > 0, otherwise None."""
        try:
            number = float(value)
        except ValueError:
            return None
        return number if number > 0 else None

    def normalize(self, text: str) -> NormalizationResult:
        """Extract a single amount from *text*.

        Strategies are tried in order and the first one that yields a
        strictly positive amount wins:

        1. Whole text is a payment-line "kronor ore" pair.
        2. Decimal amounts found line by line (the last one is returned).
        3. Shared text cleaner + validator on the whole text.
        4. Any "digits,dd" / "digits.dd" token (the last one is returned).
        5. An integer following an amount keyword (amount/belopp/summa/total).
        6. The last standalone integer with at least three digits.

        Returns a failure result for empty text or when nothing parses.
        """
        text = text.strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # Early check: payment line kronor/ore format ("590 00" → 590.00)
        kronor_ore_result = self._try_kronor_ore(text)
        if kronor_ore_result is not None:
            return kronor_ore_result

        # Collect all valid amounts line by line so the last one (usually the
        # total) can be returned; a match never spans a line break.
        all_amounts: list[float] = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            for found in self._AMOUNT_PATTERN.finditer(line):
                token = found.group(1) or found.group(2)
                if not token:
                    continue
                parsed = self._parse_amount_str(token)
                if parsed is not None and parsed > 0:
                    all_amounts.append(parsed)

        if all_amounts:
            return NormalizationResult.success(f"{all_amounts[-1]:.2f}")

        # Fallback: try shared validator on cleaned text
        cleaned = TextCleaner.normalize_amount_text(text)
        amount = FieldValidators.parse_amount(cleaned)
        if amount is not None and amount > 0:
            return NormalizationResult.success(f"{amount:.2f}")

        # Try to find any decimal number; the last occurrence wins
        decimals = self._SIMPLE_DECIMAL_PATTERN.findall(text)
        if decimals:
            amount = self._positive_float(decimals[-1].replace(",", "."))
            if amount is not None:
                return NormalizationResult.success(f"{amount:.2f}")

        # Last resort: integer amount (no decimals) after a keyword,
        # e.g. "Amount: 11699"
        labeled = self._LABELED_INT_PATTERN.search(text)
        if labeled:
            amount = self._positive_float(labeled.group(1))
            if amount is not None:
                return NormalizationResult.success(f"{amount:.2f}")

        # Very last resort: any standalone number with >= 3 digits.
        # The LAST such number in the text is used (not the largest).
        standalone = self._STANDALONE_INT_PATTERN.findall(text)
        if standalone:
            amount = self._positive_float(standalone[-1])
            if amount is not None:
                return NormalizationResult.success(f"{amount:.2f}")

        return NormalizationResult.failure(f"Cannot parse amount: {text}")
class EnhancedAmountNormalizer(AmountNormalizer):
    """
    Enhanced amount parsing with multiple strategies.

    Strategies:
    1. Pattern matching for Swedish formats
    2. Context-aware extraction (look for keywords like "Total", "Summa")
    3. OCR error correction for common digit errors
    4. Multi-amount handling (prefer labeled/later amounts as total)

    Every accepted amount must satisfy 0 < amount < 10 000 000.
    """

    # Exclusive upper bound for a plausible amount; larger values are rejected.
    _MAX_AMOUNT = 10_000_000

    # (compiled pattern, priority) pairs. Each pattern has ONE capture group
    # holding the raw amount, which is either European ("1 234,56") or Anglo
    # ("1,234.56"); (?!\d) prevents partial matches like "1,23" in "1,234.56".
    _CANDIDATE_PATTERNS = (
        # Swedish total-like labels: highest priority
        (
            re.compile(
                r"(?:att\s+betala|summa|total|belopp)\s*[:\s]*(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))",
                re.IGNORECASE,
            ),
            1.0,
        ),
        # VAT labels: lower priority than totals
        (
            re.compile(
                r"(?:moms|vat)\s*[:\s]*(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))",
                re.IGNORECASE,
            ),
            0.8,
        ),
        # Generic pattern: any amount, optionally followed by a currency word
        (
            re.compile(
                r"(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))\s*(?:kr|sek|kronor)?",
                re.IGNORECASE,
            ),
            0.7,
        ),
    )

    # Loose decimal matcher used only as the final fallback.
    _DECIMAL_PATTERN = re.compile(r"(\d{1,3}(?:[\s\.]?\d{3})*[,\.]\d{2})")

    def normalize(self, text: str) -> NormalizationResult:
        """Extract a single amount from *text* using prioritized strategies.

        Order of evaluation:

        1. Whole text is a payment-line "kronor ore" pair (checked on the
           raw text, before OCR correction).
        2. Labeled / generic amount patterns on OCR-corrected text; the
           candidate with the highest priority wins, ties broken by the
           later position in the text.
        3. Shared text cleaner + validator on the OCR-corrected text.
        4. Any loosely formatted decimal number; the largest one wins.

        Returns a failure result for empty text or when nothing parses.
        The failure message includes at most the first 50 characters of
        the input.
        """
        text = text.strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # Early check: payment line kronor/ore format ("590 00" → 590.00)
        kronor_ore_result = self._try_kronor_ore(text)
        if kronor_ore_result is not None:
            return kronor_ore_result

        # Strategy 1: Apply OCR corrections first (non-aggressive mode)
        corrected_text = OCRCorrections.correct_digits(text, aggressive=False).corrected

        # Strategy 2: Look for labeled amounts (highest priority)
        # Each candidate is (amount, priority, start offset of the match).
        candidates: list[tuple[float, float, int]] = []
        for pattern, priority in self._CANDIDATE_PATTERNS:
            for found in pattern.finditer(corrected_text):
                parsed = self._parse_amount_str(found.group(1))
                if parsed is not None and 0 < parsed < self._MAX_AMOUNT:
                    candidates.append((parsed, priority, found.start()))

        if candidates:
            # Highest priority first, then the later position (usually the total)
            best_amount = max(candidates, key=lambda c: (c[1], c[2]))[0]
            return NormalizationResult.success(f"{best_amount:.2f}")

        # Strategy 3: Parse with shared validator
        cleaned = TextCleaner.normalize_amount_text(corrected_text)
        amount = FieldValidators.parse_amount(cleaned)
        if amount is not None and 0 < amount < self._MAX_AMOUNT:
            return NormalizationResult.success(f"{amount:.2f}")

        # Strategy 4: Try to extract any decimal number as fallback.
        # Parsing goes through the shared helper so that a dot-only match
        # such as "1234.56" is read as a decimal (1234.56) instead of having
        # its dot stripped as a thousands separator (123456).
        amounts: list[float] = []
        for raw in self._DECIMAL_PATTERN.findall(corrected_text):
            parsed = self._parse_amount_str(raw)
            if parsed is not None and 0 < parsed < self._MAX_AMOUNT:
                amounts.append(parsed)

        if amounts:
            # Return the largest amount (usually the total)
            return NormalizationResult.success(f"{max(amounts):.2f}")

        return NormalizationResult.failure(f"Cannot parse amount: {text[:50]}")