"""
Amount Normalizer

Handles normalization and validation of monetary amounts.
"""
|
|
|
|
import re
|
|
|
|
from shared.utils.text_cleaner import TextCleaner
|
|
from shared.utils.validators import FieldValidators
|
|
from shared.utils.ocr_corrections import OCRCorrections
|
|
|
|
from .base import BaseNormalizer, NormalizationResult
|
|
|
|
|
|
class AmountNormalizer(BaseNormalizer):
    """
    Normalizes monetary amounts from Swedish invoices.

    Handles various Swedish amount formats:
    - With decimal: 1 234,56 kr
    - With SEK suffix: 1234.56 SEK
    - Payment line kronor/ore: 590 00 (space = decimal separator)
    - Multiple amounts (returns the last one, usually the total)

    Successful results carry the amount formatted with two decimals and a
    dot as decimal separator (e.g. "1234.56").
    """

    # Payment line kronor/ore pattern: "590 00" means 590.00 SEK
    # Only matches when no comma/dot is present (pure digit-space-2digit format)
    _KRONOR_ORE_PATTERN = re.compile(r'^(\d+)\s+(\d{2})$')

    # Separate alternatives for European and Anglo formats.
    # (?!\d) lookahead prevents partial matches (e.g. "1,23" in "1,234.56")
    # group(1) European: dot=thousand, comma=decimal (2.254,50 or 1 234,56)
    # group(2) Anglo: comma=thousand, dot=decimal (1,234.56 or 1234.56)
    _AMOUNT_PATTERN = re.compile(
        r"(\d[\d\s.]*,\d{2})(?!\d)\s*(?:kr|SEK)?"
        r"|"
        r"(\d[\d\s,]*\.\d{2})(?!\d)\s*(?:kr|SEK)?",
        re.IGNORECASE,
    )

    # Fallback: any run of digits followed by a separator and two digits.
    _SIMPLE_DECIMAL_PATTERN = re.compile(r"(\d+[,\.]\d{2})")

    # Fallback: integer amount introduced by a keyword ("Amount: 11699").
    _LABELED_INT_PATTERN = re.compile(
        r"(?:amount|belopp|summa|total)[:\s]*(\d+)", re.IGNORECASE
    )

    # Fallback: any standalone number with at least three digits.
    _STANDALONE_INT_PATTERN = re.compile(r"\b(\d{3,})\b")

    @property
    def field_name(self) -> str:
        """Name of the field this normalizer handles."""
        return "Amount"

    @staticmethod
    def _result_from_number(raw: str) -> NormalizationResult | None:
        """Build a success result from a plain numeric string.

        Returns a success result formatted with two decimals when *raw*
        converts to a float strictly greater than zero; returns None when
        the conversion fails or the value is not positive.
        """
        try:
            amount = float(raw)
        except ValueError:
            return None
        if amount > 0:
            return NormalizationResult.success(f"{amount:.2f}")
        return None

    @classmethod
    def _try_kronor_ore(cls, text: str) -> NormalizationResult | None:
        """Try to parse as payment line kronor/ore format.

        Swedish payment lines separate kronor and ore with a space:
        "590 00" = 590.00 SEK, "15658 00" = 15658.00 SEK

        Only applies when text has no comma or dot (otherwise it's
        a normal amount format with explicit decimal separator).

        Returns NormalizationResult on success, None if not matched
        (or if the parsed amount is not positive).
        """
        if ',' in text or '.' in text:
            return None

        match = cls._KRONOR_ORE_PATTERN.match(text.strip())
        if not match:
            return None

        kronor, ore = match.groups()
        return cls._result_from_number(f"{kronor}.{ore}")

    @staticmethod
    def _parse_amount_str(match: str) -> float | None:
        """Convert matched amount string to float, detecting European vs Anglo format.

        European: 2.254,50 -> 2254.50 (dot=thousand, comma=decimal)
        Anglo: 1,234.56 -> 1234.56 (comma=thousand, dot=decimal)
        Swedish: 1 234,56 -> 1234.56 (space=thousand, comma=decimal)

        Returns None when the cleaned string is not a valid float.
        """
        # The amount regexes accept any `\s` between digit groups, which for
        # str patterns includes non-breaking and narrow spaces. Remove every
        # whitespace character, not only ASCII " ", or such amounts would
        # fail float() and be silently dropped.
        compact = "".join(match.split())

        has_comma = ',' in compact
        has_dot = '.' in compact
        if has_comma and has_dot:
            if compact.rfind(',') > compact.rfind('.'):
                # European: 2.254,50
                cleaned = compact.replace(".", "").replace(",", ".")
            else:
                # Anglo: 1,234.56
                cleaned = compact.replace(",", "")
        elif has_comma:
            cleaned = compact.replace(",", ".")
        else:
            cleaned = compact

        try:
            return float(cleaned)
        except ValueError:
            return None

    def normalize(self, text: str) -> NormalizationResult:
        """Extract a monetary amount from *text*.

        Strategies are tried in order and the first one that yields a
        positive amount wins:

        1. Whole text is a payment-line "kronor ore" pair ("590 00").
        2. Decimal amounts found line by line; the last one is returned
           (usually the total).
        3. Shared TextCleaner / FieldValidators helpers on the whole text.
        4. Last "digits, separator, two digits" occurrence.
        5. Integer following a keyword (amount/belopp/summa/total).
        6. Last standalone number with three or more digits.

        Returns a failure result for empty text or when nothing matches.
        """
        text = text.strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # Early check: payment line kronor/ore format ("590 00" -> 590.00)
        kronor_ore_result = self._try_kronor_ore(text)
        if kronor_ore_result is not None:
            return kronor_ore_result

        # Process line by line so the whitespace-tolerant pattern cannot
        # glue numbers from different lines into one amount.
        all_amounts: list[float] = []
        for line in text.split("\n"):
            line = line.strip()
            if not line:
                continue
            for m in self._AMOUNT_PATTERN.finditer(line):
                # Exactly one of the two alternatives matched.
                amount = self._parse_amount_str(m.group(1) or m.group(2))
                if amount is not None and amount > 0:
                    all_amounts.append(amount)

        # Return the last amount found (usually the total)
        if all_amounts:
            return NormalizationResult.success(f"{all_amounts[-1]:.2f}")

        # Fallback: try shared validator on cleaned text
        # NOTE(review): behaviour of these shared helpers is not visible
        # here; parse_amount is assumed to return a number or None.
        cleaned = TextCleaner.normalize_amount_text(text)
        amount = FieldValidators.parse_amount(cleaned)
        if amount is not None and amount > 0:
            return NormalizationResult.success(f"{amount:.2f}")

        # Try to find any decimal number; take the last occurrence
        decimals = self._SIMPLE_DECIMAL_PATTERN.findall(text)
        if decimals:
            result = self._result_from_number(decimals[-1].replace(",", "."))
            if result is not None:
                return result

        # Last resort: integer amount introduced by a keyword,
        # e.g. "Amount: 11699"
        labeled = self._LABELED_INT_PATTERN.search(text)
        if labeled:
            result = self._result_from_number(labeled.group(1))
            if result is not None:
                return result

        # Very last resort: the last standalone number with >= 3 digits
        standalone = self._STANDALONE_INT_PATTERN.findall(text)
        if standalone:
            result = self._result_from_number(standalone[-1])
            if result is not None:
                return result

        return NormalizationResult.failure(f"Cannot parse amount: {text}")
|
|
|
|
|
|
class EnhancedAmountNormalizer(AmountNormalizer):
    """
    Enhanced amount parsing with multiple strategies.

    Strategies:
    1. Payment-line kronor/ore shortcut ("590 00")
    2. OCR digit correction (non-aggressive) before any pattern matching
    3. Context-aware extraction (keywords like "Total", "Summa" outrank
       VAT-labeled and unlabeled amounts; later matches win within a tier)
    4. Shared validator on the corrected text
    5. Loose decimal fallback (largest plausible amount wins)

    Amounts must be positive and below ``_MAX_AMOUNT`` to be accepted by
    strategies 3-5.
    """

    # Exclusive upper bound used to reject implausible amounts.
    _MAX_AMOUNT = 10_000_000

    # (pattern, priority) pairs. Each pattern has ONE capture group whose
    # alternation accepts either European (comma decimal) or Anglo (dot
    # decimal) notation; (?!\d) prevents partial matches like "1,23" in
    # "1,234.56".
    _LABELED_PATTERNS = (
        # Swedish total keywords (highest priority)
        (
            re.compile(
                r"(?:att\s+betala|summa|total|belopp)\s*[:\s]*(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))",
                re.IGNORECASE,
            ),
            1.0,
        ),
        # Lower priority for VAT
        (
            re.compile(
                r"(?:moms|vat)\s*[:\s]*(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))",
                re.IGNORECASE,
            ),
            0.8,
        ),
        # Generic pattern: any amount, optionally followed by a currency word
        (
            re.compile(
                r"(\d[\d\s.]*,\d{2}(?!\d)|\d[\d\s,]*\.\d{2}(?!\d))\s*(?:kr|sek|kronor)?",
                re.IGNORECASE,
            ),
            0.7,
        ),
    )

    # Loose fallback: 1-3 digits, optional 3-digit groups separated by
    # whitespace or dot, then a comma/dot and exactly two digits.
    _DECIMAL_FALLBACK_PATTERN = re.compile(r"(\d{1,3}(?:[\s\.]?\d{3})*[,\.]\d{2})")

    def normalize(self, text: str) -> NormalizationResult:
        """Extract a monetary amount from *text* using prioritized strategies.

        Returns a success result with the amount formatted to two decimals,
        or a failure result for empty text / when no strategy finds a
        plausible amount (the failure message echoes at most the first 50
        characters of the stripped input).
        """
        text = text.strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # Early check: payment line kronor/ore format ("590 00" -> 590.00)
        kronor_ore_result = self._try_kronor_ore(text)
        if kronor_ore_result is not None:
            return kronor_ore_result

        # Apply OCR corrections first; all later strategies use this text.
        # NOTE(review): exact corrections performed by correct_digits are
        # defined in shared.utils.ocr_corrections, not visible here.
        corrected_text = OCRCorrections.correct_digits(text, aggressive=False).corrected

        # Labeled / generic amounts: collect (amount, priority, position).
        candidates: list[tuple[float, float, int]] = []
        for pattern, priority in self._LABELED_PATTERNS:
            for match in pattern.finditer(corrected_text):
                amount = self._parse_amount_str(match.group(1))
                if amount is not None and 0 < amount < self._MAX_AMOUNT:
                    candidates.append((amount, priority, match.start()))

        if candidates:
            # Highest priority wins; within a priority the later match
            # wins (later is usually the total).
            best_amount, _, _ = max(candidates, key=lambda c: (c[1], c[2]))
            return NormalizationResult.success(f"{best_amount:.2f}")

        # Parse with shared validator
        cleaned = TextCleaner.normalize_amount_text(corrected_text)
        amount = FieldValidators.parse_amount(cleaned)
        if amount is not None and 0 < amount < self._MAX_AMOUNT:
            return NormalizationResult.success(f"{amount:.2f}")

        # Fallback: extract any decimal-looking number
        amounts: list[float] = []
        for raw in self._DECIMAL_FALLBACK_PATTERN.findall(corrected_text):
            if "," not in raw and raw.count(".") > 1:
                # The pattern guarantees exactly two digits after the final
                # separator, so the last dot is the decimal mark and any
                # earlier dots are thousands separators ("1.234.56").
                head, _, tail = raw.rpartition(".")
                raw = head.replace(".", "") + "." + tail
            # Reuse the shared format detection instead of stripping every
            # dot: blindly removing dots turned "234.56" into 23456.00.
            amt = self._parse_amount_str(raw)
            if amt is not None and 0 < amt < self._MAX_AMOUNT:
                amounts.append(amt)

        if amounts:
            # Return the largest amount (usually the total)
            return NormalizationResult.success(f"{max(amounts):.2f}")

        return NormalizationResult.failure(f"Cannot parse amount: {text[:50]}")
|