Re-structure the project.
This commit is contained in:
130
src/normalize/normalizers/amount_normalizer.py
Normal file
130
src/normalize/normalizers/amount_normalizer.py
Normal file
@@ -0,0 +1,130 @@
|
||||
"""
|
||||
Amount Normalizer
|
||||
|
||||
Normalizes monetary amounts with various formats and separators.
|
||||
"""
|
||||
|
||||
import re
|
||||
from .base import BaseNormalizer
|
||||
|
||||
|
||||
class AmountNormalizer(BaseNormalizer):
    """
    Normalizes monetary amounts.

    Handles Swedish and international formats with different
    thousand/decimal separators and produces the alternative spellings
    of the same amount so that it can be matched against text written
    in any of them.

    Examples (the returned list contains at least these variants; more
    may be added by the numeric re-formatting step):
        '114' -> ['114', '114,00', '114.00']
        '114,00' -> ['114,00', '114.00', '114']
        '1 234,56' -> ['1234,56', '1234.56', '1 234,56']
        '3045 52' -> ['3045.52', '3045,52', '304552']
    """

    # Trailing currency markers: "SEK", "kr", ":-", ",-" or a bare "-",
    # possibly repeated and separated by whitespace (e.g. "100 kr:-").
    # This must be an alternation; a character class such as
    # ``[SEK|kr|:-]`` would strip arbitrary trailing letters s/e/k/r.
    _CURRENCY_SUFFIX_RE = re.compile(
        r'(?:\s*(?:SEK|kr|[:,]?-))+\s*$', re.IGNORECASE
    )
    # "3045 52": whitespace used as the decimal separator.
    _SPACE_DECIMAL_RE = re.compile(r'^(\d+)\s+(\d{2})$')
    # "10 571,00": (non-breaking) space used as the thousand separator.
    _SPACE_THOUSAND_RE = re.compile(
        r'^(\d{1,3})[\s\xa0]+(\d{3})([,\.])(\d{2})$'
    )
    # "1,390.00": US style.
    _US_FORMAT_RE = re.compile(r'^(\d{1,3}),(\d{3})\.(\d{2})$')
    # "1.390,00": European style.
    _EU_FORMAT_RE = re.compile(r'^(\d{1,3})\.(\d{3}),(\d{2})$')

    def normalize(self, value: str) -> list[str]:
        """Generate variants of amount.

        Args:
            value: Raw amount text, optionally followed by a currency
                suffix such as ``kr``, ``SEK`` or ``:-``.

        Returns:
            The distinct, non-empty variants in the order they were
            generated (the cleaned input first). An empty list is
            returned when nothing is left after cleaning.
        """
        # clean_text is inherited from BaseNormalizer, which is not
        # visible here; presumably it trims/normalizes whitespace.
        value = self.clean_text(value)

        # Remove currency symbols and common suffixes
        value = self._CURRENCY_SUFFIX_RE.sub('', value).strip()
        if not value:
            # Nothing to normalize; avoid emitting junk such as ",00".
            return []

        variants = [value]

        # Check for space as decimal separator: "3045 52"
        space_decimal_match = self._SPACE_DECIMAL_RE.match(value)
        if space_decimal_match:
            integer_part, decimal_part = space_decimal_match.groups()
            variants.append(f"{integer_part}.{decimal_part}")
            variants.append(f"{integer_part},{decimal_part}")
            variants.append(f"{integer_part}{decimal_part}")

        # Check for space as thousand separator: "10 571,00"
        space_thousand_match = self._SPACE_THOUSAND_RE.match(value)
        if space_thousand_match:
            part1, part2, sep, decimal = space_thousand_match.groups()
            combined = f"{part1}{part2}"
            variants.append(f"{combined}.{decimal}")
            variants.append(f"{combined},{decimal}")
            variants.append(f"{combined}{decimal}")
            # Same grouping, but with the other decimal separator.
            other_sep = ',' if sep == '.' else '.'
            variants.append(f"{part1} {part2}{other_sep}{decimal}")

        # Handle US format: "1,390.00"
        us_format_match = self._US_FORMAT_RE.match(value)
        if us_format_match:
            part1, part2, decimal = us_format_match.groups()
            combined = f"{part1}{part2}"
            variants.append(f"{combined}.{decimal}")
            variants.append(f"{combined},{decimal}")
            variants.append(combined)
            # European spelling of the same amount.
            variants.append(f"{part1}.{part2},{decimal}")

        # Handle European format: "1.390,00"
        eu_format_match = self._EU_FORMAT_RE.match(value)
        if eu_format_match:
            part1, part2, decimal = eu_format_match.groups()
            combined = f"{part1}{part2}"
            variants.append(f"{combined}.{decimal}")
            variants.append(f"{combined},{decimal}")
            variants.append(combined)
            # US spelling of the same amount.
            variants.append(f"{part1},{part2}.{decimal}")

        # Remove spaces (thousand separators)
        no_space = value.replace(' ', '').replace('\xa0', '')

        # Normalize decimal separator
        if ',' in no_space:
            variants.append(no_space)
            variants.append(no_space.replace(',', '.'))
        elif '.' in no_space:
            variants.append(no_space)
            variants.append(no_space.replace('.', ','))
        else:
            # Integer amount - add decimal versions
            variants.append(no_space)
            variants.append(f"{no_space},00")
            variants.append(f"{no_space}.00")

        # Try to parse and get clean numeric value
        try:
            num = float(no_space.replace(',', '.'))

            # Integer if no decimals
            if num == int(num):
                int_val = int(num)
                variants.append(str(int_val))
                variants.append(f"{int_val},00")
                variants.append(f"{int_val}.00")

                # European format with dot as thousand separator
                if int_val >= 1000:
                    formatted = f"{int_val:,}".replace(',', '.')
                    variants.append(formatted)
                    variants.append(f"{formatted},00")
            else:
                two_decimals = f"{num:.2f}"
                variants.append(two_decimals)
                variants.append(two_decimals.replace('.', ','))

                # European format with dot as thousand separator
                if num >= 1000:
                    int_str, dec_str = two_decimals.split(".")
                    formatted_int = f"{int(int_str):,}".replace(',', '.')
                    variants.append(f"{formatted_int},{dec_str}")
        except (ValueError, OverflowError):
            # ValueError: text is not a number (or is "nan").
            # OverflowError: float() accepted "inf"/"1e999" and int()
            # cannot convert infinity. Either way there is no numeric
            # form to add; keep the textual variants collected so far.
            pass

        # De-duplicate while keeping generation order so the result is
        # deterministic across runs (a set would not be).
        return list(dict.fromkeys(v for v in variants if v))
|
||||
Reference in New Issue
Block a user