131 lines
4.9 KiB
Python
131 lines
4.9 KiB
Python
"""
|
|
Amount Normalizer
|
|
|
|
Normalizes monetary amounts with various formats and separators.
|
|
"""
|
|
|
|
import re
|
|
from .base import BaseNormalizer
|
|
|
|
|
|
class AmountNormalizer(BaseNormalizer):
    """
    Normalizes monetary amounts.

    Handles Swedish and international formats with different
    thousand/decimal separators.

    Examples (the longer results are abbreviated with ``...``):
        '114'      -> ['114', '114,00', '114.00']
        '114,00'   -> ['114,00', '114.00', '114']
        '1 234,56' -> ['1 234,56', '1234.56', '1234,56', ..., '1.234,56']
        '3045 52'  -> ['3045 52', '3045.52', '3045,52', '304552', ...]
    """

    # Trailing currency markers: the tokens "SEK", "kr" and ":-" (any case),
    # optionally separated by whitespace and repeated, plus stray ':' / '-'.
    # NOTE: this must be an alternation of whole tokens. A character class
    # such as [SEK|kr|:-] would strip the individual letters s/e/k/r instead.
    _CURRENCY_SUFFIX_RE = re.compile(r'(?:\s*(?:SEK|kr|[:\-]))+$', re.IGNORECASE)

    # "3045 52": whitespace used as the decimal separator.
    _SPACE_DECIMAL_RE = re.compile(r'^(\d+)\s+(\d{2})$')

    # "10 571,00": whitespace as thousand separator, ',' or '.' as decimal.
    _SPACE_THOUSAND_RE = re.compile(r'^(\d{1,3})[\s\xa0]+(\d{3})([,\.])(\d{2})$')

    # "1,390.00": US style.
    _US_FORMAT_RE = re.compile(r'^(\d{1,3}),(\d{3})\.(\d{2})$')

    # "1.390,00": European style.
    _EU_FORMAT_RE = re.compile(r'^(\d{1,3})\.(\d{3}),(\d{2})$')

    def normalize(self, value: str) -> list[str]:
        """Generate textual variants of a monetary amount.

        Args:
            value: Raw amount text. It is first passed through
                ``self.clean_text`` (inherited; defined outside this file).

        Returns:
            A list of unique, non-empty variants in a deterministic order,
            starting with the cleaned input. An input that is empty once
            trailing currency markers are removed yields an empty list.
        """
        value = self.clean_text(value)

        # Remove currency symbols and common suffixes.
        value = self._CURRENCY_SUFFIX_RE.sub('', value).strip()
        if not value:
            # Nothing left to normalize; avoid emitting bare ",00" / ".00".
            return []

        variants = [value]

        # Space as decimal separator: "3045 52".
        match = self._SPACE_DECIMAL_RE.match(value)
        if match:
            integer_part, decimal_part = match.groups()
            variants += [
                f"{integer_part}.{decimal_part}",
                f"{integer_part},{decimal_part}",
                f"{integer_part}{decimal_part}",
            ]

        # Space as thousand separator: "10 571,00".
        match = self._SPACE_THOUSAND_RE.match(value)
        if match:
            part1, part2, sep, decimal = match.groups()
            combined = f"{part1}{part2}"
            other_sep = ',' if sep == '.' else '.'
            variants += [
                f"{combined}.{decimal}",
                f"{combined},{decimal}",
                f"{combined}{decimal}",
                f"{part1} {part2}{other_sep}{decimal}",
            ]

        # US format: "1,390.00" (also emitted in European layout).
        match = self._US_FORMAT_RE.match(value)
        if match:
            part1, part2, decimal = match.groups()
            combined = f"{part1}{part2}"
            variants += [
                f"{combined}.{decimal}",
                f"{combined},{decimal}",
                combined,
                f"{part1}.{part2},{decimal}",
            ]

        # European format: "1.390,00" (also emitted in US layout).
        match = self._EU_FORMAT_RE.match(value)
        if match:
            part1, part2, decimal = match.groups()
            combined = f"{part1}{part2}"
            variants += [
                f"{combined}.{decimal}",
                f"{combined},{decimal}",
                combined,
                f"{part1},{part2}.{decimal}",
            ]

        # Remove spaces (thousand separators).
        no_space = value.replace(' ', '').replace('\xa0', '')

        # Normalize decimal separator.
        if ',' in no_space:
            variants += [no_space, no_space.replace(',', '.')]
        elif '.' in no_space:
            variants += [no_space, no_space.replace('.', ',')]
        else:
            # Integer amount - add decimal versions.
            variants += [no_space, f"{no_space},00", f"{no_space}.00"]

        # Variants derived from the parsed numeric value (if it parses).
        variants += self._numeric_variants(no_space)

        # De-duplicate while preserving insertion order so the result is
        # stable between runs (a plain set() has no guaranteed order).
        return list(dict.fromkeys(v for v in variants if v))

    @staticmethod
    def _numeric_variants(compact: str) -> list[str]:
        """Return re-formatted variants of *compact* parsed as a number.

        ``compact`` is the amount without spaces; ',' is treated as the
        decimal separator. Returns an empty list when the text is not a
        finite number.
        """
        try:
            num = float(compact.replace(',', '.'))
            # int() raises ValueError for NaN and OverflowError for infinity
            # (float() accepts "nan", "inf", "1e999", ...).
            whole = int(num)
        except (ValueError, OverflowError):
            return []

        if num == whole:
            # Integer if no decimals.
            result = [str(whole), f"{whole},00", f"{whole}.00"]
            if whole >= 1000:
                # European format with dot as thousand separator.
                dotted = f"{whole:,}".replace(',', '.')
                result += [dotted, f"{dotted},00"]
            return result

        fixed = f"{num:.2f}"
        result = [fixed, fixed.replace('.', ',')]
        if num >= 1000:
            # European format with dot as thousand separator.
            int_str, dec_str = fixed.split('.')
            dotted = f"{int(int_str):,}".replace(',', '.')
            result.append(f"{dotted},{dec_str}")
        return result
|