Files
invoice-master-poc-v2/src/normalize/normalizers/amount_normalizer.py
2026-01-25 15:21:11 +01:00

131 lines
4.9 KiB
Python

"""
Amount Normalizer
Normalizes monetary amounts with various formats and separators.
"""
import re
from .base import BaseNormalizer
class AmountNormalizer(BaseNormalizer):
    """
    Normalizes monetary amounts.

    Handles Swedish and international formats with different
    thousand/decimal separators and produces a list of alternative
    spellings of the same amount.

    The result is de-duplicated and keeps first-seen order, so the cleaned
    input value is always the first element.

    Examples (the returned list includes at least these variants):
        '114'      -> '114', '114,00', '114.00'
        '114,00'   -> '114,00', '114.00', '114'
        '1 234,56' -> '1 234,56', '1234.56', '1234,56', '1.234,56'
        '3045 52'  -> '3045 52', '3045.52', '3045,52', '304552'
        '100 kr'   -> same variants as '100'
    """

    # Trailing currency markers: "SEK", "kr" and the dash notations
    # ":-", ",-", ".-" or a bare "-", optionally separated by whitespace.
    # This is an alternation of whole tokens on purpose: a character class
    # such as [SEK|kr|:-] would strip single letters and mangle values like
    # "100 NOK" into "100 NO".
    _CURRENCY_SUFFIX_RE = re.compile(
        r'(?:\s*(?:SEK|kr|[:,.]?-))+$', re.IGNORECASE
    )
    # "3045 52": whitespace used as the decimal separator.
    _SPACE_DECIMAL_RE = re.compile(r'^(\d+)\s+(\d{2})$')
    # "10 571,00": whitespace used as the thousand separator.
    _SPACE_THOUSAND_RE = re.compile(
        r'^(\d{1,3})[\s\xa0]+(\d{3})([,\.])(\d{2})$'
    )
    # "1,390.00": US style.
    _US_FORMAT_RE = re.compile(r'^(\d{1,3}),(\d{3})\.(\d{2})$')
    # "1.390,00": European style.
    _EU_FORMAT_RE = re.compile(r'^(\d{1,3})\.(\d{3}),(\d{2})$')

    def normalize(self, value: str) -> list[str]:
        """
        Generate variants of amount.

        Args:
            value: Raw amount text. It is first passed through
                ``self.clean_text`` (not defined in this class; presumably
                provided by ``BaseNormalizer``).

        Returns:
            Unique, non-empty variants in first-seen order. An empty list
            is returned when nothing is left after cleaning and removing
            the currency suffix.
        """
        value = self.clean_text(value)

        # Remove currency symbols and common suffixes
        value = self._CURRENCY_SUFFIX_RE.sub('', value).strip()
        if not value:
            # Nothing to normalize; avoid emitting junk such as ",00".
            return []

        variants = [value]

        # Check for space as decimal separator: "3045 52"
        space_decimal_match = self._SPACE_DECIMAL_RE.match(value)
        if space_decimal_match:
            integer_part, decimal_part = space_decimal_match.groups()
            variants.append(f"{integer_part}.{decimal_part}")
            variants.append(f"{integer_part},{decimal_part}")
            variants.append(f"{integer_part}{decimal_part}")

        # Check for space as thousand separator: "10 571,00"
        space_thousand_match = self._SPACE_THOUSAND_RE.match(value)
        if space_thousand_match:
            part1, part2, sep, decimal = space_thousand_match.groups()
            combined = f"{part1}{part2}"
            variants.append(f"{combined}.{decimal}")
            variants.append(f"{combined},{decimal}")
            variants.append(f"{combined}{decimal}")
            # Same grouping, but with the other decimal separator.
            other_sep = ',' if sep == '.' else '.'
            variants.append(f"{part1} {part2}{other_sep}{decimal}")

        # Handle US format: "1,390.00"
        us_format_match = self._US_FORMAT_RE.match(value)
        if us_format_match:
            part1, part2, decimal = us_format_match.groups()
            combined = f"{part1}{part2}"
            variants.append(f"{combined}.{decimal}")
            variants.append(f"{combined},{decimal}")
            variants.append(combined)
            variants.append(f"{part1}.{part2},{decimal}")

        # Handle European format: "1.390,00"
        eu_format_match = self._EU_FORMAT_RE.match(value)
        if eu_format_match:
            part1, part2, decimal = eu_format_match.groups()
            combined = f"{part1}{part2}"
            variants.append(f"{combined}.{decimal}")
            variants.append(f"{combined},{decimal}")
            variants.append(combined)
            variants.append(f"{part1},{part2}.{decimal}")

        # Remove spaces (thousand separators)
        no_space = value.replace(' ', '').replace('\xa0', '')

        # Normalize decimal separator
        if ',' in no_space:
            variants.append(no_space)
            variants.append(no_space.replace(',', '.'))
        elif '.' in no_space:
            variants.append(no_space)
            variants.append(no_space.replace('.', ','))
        else:
            # Integer amount - add decimal versions
            variants.append(no_space)
            variants.append(f"{no_space},00")
            variants.append(f"{no_space}.00")

        # Try to parse and get clean numeric value
        try:
            num = float(no_space.replace(',', '.'))
            # int() raises ValueError for NaN and OverflowError for
            # infinity (e.g. "inf" or "1e400"); both mean "not a usable
            # amount", so only the textual variants are kept.
            int_val = int(num)
        except (ValueError, OverflowError):
            pass
        else:
            if num == int_val:
                # Integer if no decimals
                variants.append(str(int_val))
                variants.append(f"{int_val},00")
                variants.append(f"{int_val}.00")
                # European format with dot as thousand separator
                if int_val >= 1000:
                    formatted = f"{int_val:,}".replace(',', '.')
                    variants.append(formatted)
                    variants.append(f"{formatted},00")
            else:
                two_decimals = f"{num:.2f}"
                variants.append(two_decimals)
                variants.append(two_decimals.replace('.', ','))
                # European format with dot as thousand separator
                if num >= 1000:
                    int_str, dec_str = two_decimals.split(".")
                    formatted_int = f"{int(int_str):,}".replace(',', '.')
                    variants.append(f"{formatted_int},{dec_str}")

        # dict.fromkeys de-duplicates while keeping insertion order, which
        # makes the result deterministic across runs.
        return list(dict.fromkeys(v for v in variants if v))