"""
Amount Normalizer

Normalizes monetary amounts with various formats and separators.
"""

import re

from .base import BaseNormalizer

# Trailing currency markers: "SEK", "kr", and the dash notations ":-", ",-",
# ".-" or a bare "-". Written as an alternation; a character class such as
# [SEK|kr|:-] would strip arbitrary runs of those individual characters.
_CURRENCY_SUFFIX_RE = re.compile(r'(?:\s*(?:SEK|kr|[:,.]?-))+\s*$', re.IGNORECASE)

# "3045 52": whitespace used as the decimal separator.
_SPACE_DECIMAL_RE = re.compile(r'^(\d+)\s+(\d{2})$')

# "10 571,00": whitespace as thousand separator, ',' or '.' as decimal.
_SPACE_THOUSAND_RE = re.compile(r'^(\d{1,3})[\s\xa0]+(\d{3})([,\.])(\d{2})$')

# "1,390.00" (US) and "1.390,00" (European), single thousand group.
_US_FORMAT_RE = re.compile(r'^(\d{1,3}),(\d{3})\.(\d{2})$')
_EU_FORMAT_RE = re.compile(r'^(\d{1,3})\.(\d{3}),(\d{2})$')

# Head of 1-3 digits, one or more 3-digit groups joined by the SAME separator,
# then an optional decimal part introduced by the OTHER separator.
_GROUPED_RE = re.compile(
    r'(-?\d{1,3})((?:([.,])\d{3})(?:\3\d{3})*)(?:(?!\3)[.,](\d+))?'
)

# Swaps ',' and '.' in a single pass.
_SWAP_SEPARATORS = str.maketrans(',.', '.,')


def _ungroup_thousands(compact: str) -> str:
    """Return ``compact`` without thousand separators, or '' if not grouped.

    ``compact`` is an amount with whitespace already removed. The result uses
    '.' as decimal separator so it can be passed to ``float``. A single
    group without decimals (e.g. '1,234') is ambiguous between a thousand
    separator and a decimal separator and is therefore NOT treated as
    grouped; '' is returned for it and for anything else that does not match.
    """
    match = _GROUPED_RE.fullmatch(compact)
    if match is None:
        return ''

    head, groups, thousand_sep, decimals = match.groups()
    # Each group is one separator + three digits, so a longer tail means
    # at least two groups.
    has_several_groups = len(groups) > 4
    if decimals is None and not has_several_groups:
        return ''

    integer_digits = head + groups.replace(thousand_sep, '')
    if decimals is None:
        return integer_digits
    return f"{integer_digits}.{decimals}"


class AmountNormalizer(BaseNormalizer):
    """
    Normalizes monetary amounts.

    Handles Swedish and international formats with different
    thousand/decimal separators.

    Examples (the listed variants are a subset of what is returned):
        '114' -> ['114', '114,00', '114.00']
        '114,00' -> ['114,00', '114.00', '114']
        '1 234,56' -> ['1234,56', '1234.56', '1 234,56']
        '3045 52' -> ['3045.52', '3045,52', '304552']
    """

    def normalize(self, value: str) -> list[str]:
        """Generate variants of amount.

        Args:
            value: Raw amount text. It is first passed through
                ``self.clean_text`` and then stripped of a trailing
                currency suffix (``SEK``, ``kr``, ``:-``, ``,-``, ``.-``, ``-``).

        Returns:
            De-duplicated, non-empty variants in the order they were
            generated; the cleaned input comes first. An empty list is
            returned when nothing is left after cleaning.
        """
        value = self.clean_text(value)

        # Remove currency symbols and common suffixes
        value = _CURRENCY_SUFFIX_RE.sub('', value).strip()
        if not value:
            # Without this guard the integer branch below would emit the
            # meaningless variants ",00" and ".00".
            return []

        variants = [value]

        # Check for space as decimal separator: "3045 52"
        space_decimal_match = _SPACE_DECIMAL_RE.match(value)
        if space_decimal_match:
            integer_part, decimal_part = space_decimal_match.groups()
            variants.append(f"{integer_part}.{decimal_part}")
            variants.append(f"{integer_part},{decimal_part}")
            variants.append(f"{integer_part}{decimal_part}")

        # Check for space as thousand separator: "10 571,00"
        space_thousand_match = _SPACE_THOUSAND_RE.match(value)
        if space_thousand_match:
            part1, part2, sep, decimal = space_thousand_match.groups()
            combined = f"{part1}{part2}"
            variants.append(f"{combined}.{decimal}")
            variants.append(f"{combined},{decimal}")
            variants.append(f"{combined}{decimal}")
            other_sep = ',' if sep == '.' else '.'
            variants.append(f"{part1} {part2}{other_sep}{decimal}")

        # Handle US format "1,390.00" and European format "1.390,00".
        # Each also yields the same amount written in the opposite convention.
        for pattern, alt_thousand, alt_decimal in (
            (_US_FORMAT_RE, '.', ','),
            (_EU_FORMAT_RE, ',', '.'),
        ):
            format_match = pattern.match(value)
            if format_match:
                part1, part2, decimal = format_match.groups()
                combined = f"{part1}{part2}"
                variants.append(f"{combined}.{decimal}")
                variants.append(f"{combined},{decimal}")
                variants.append(combined)
                variants.append(f"{part1}{alt_thousand}{part2}{alt_decimal}{decimal}")

        # Remove spaces (thousand separators)
        no_space = value.replace(' ', '').replace('\xa0', '')
        variants.append(no_space)

        # Normalize decimal separator
        ungrouped = _ungroup_thousands(no_space)
        if ungrouped:
            # Properly grouped amount ("1,234,567.89", "1.390,00"): blindly
            # swapping one separator would give "1.390.00", which is not an
            # amount and cannot be parsed. Emit the opposite convention and
            # the ungrouped forms instead.
            variants.append(no_space.translate(_SWAP_SEPARATORS))
            variants.append(ungrouped)
            variants.append(ungrouped.replace('.', ','))
            clean = ungrouped
        else:
            if ',' in no_space:
                variants.append(no_space.replace(',', '.'))
            elif '.' in no_space:
                variants.append(no_space.replace('.', ','))
            else:
                # Integer amount - add decimal versions
                variants.append(f"{no_space},00")
                variants.append(f"{no_space}.00")
            clean = no_space.replace(',', '.')

        # Try to parse and get clean numeric value
        try:
            num = float(clean)
            # int() raises ValueError for NaN and OverflowError for
            # infinity (e.g. "inf" or "1e999"); both mean "not an amount".
            whole = int(num)
        except (ValueError, OverflowError):
            return list(dict.fromkeys(v for v in variants if v))

        if num == whole:
            # Integer if no decimals
            variants.append(str(whole))
            variants.append(f"{whole},00")
            variants.append(f"{whole}.00")

            # European format with dot as thousand separator
            if whole >= 1000:
                formatted = f"{whole:,}".replace(',', '.')
                variants.append(formatted)
                variants.append(f"{formatted},00")
        else:
            two_decimals = f"{num:.2f}"
            variants.append(two_decimals)
            variants.append(two_decimals.replace('.', ','))

            # European format with dot as thousand separator
            if num >= 1000:
                int_str, dec_str = two_decimals.split(".")
                formatted_int = f"{int(int_str):,}".replace(',', '.')
                variants.append(f"{formatted_int},{dec_str}")

        # dict.fromkeys de-duplicates while keeping generation order, so the
        # result is deterministic across runs (a set is not).
        return list(dict.fromkeys(v for v in variants if v))