From 53d1e8db255dc9ca8f3c624fc57f087fd97584db Mon Sep 17 00:00:00 2001 From: Yaojia Wang Date: Thu, 15 Jan 2026 23:02:38 +0100 Subject: [PATCH] Enhance. --- src/normalize/normalizer.py | 70 ++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 8 deletions(-) diff --git a/src/normalize/normalizer.py b/src/normalize/normalizer.py index 31c84ad..f802a67 100644 --- a/src/normalize/normalizer.py +++ b/src/normalize/normalizer.py @@ -139,17 +139,71 @@ class FieldNormalizer: '114' -> ['114', '114,00', '114.00'] '114,00' -> ['114,00', '114.00', '114'] '1 234,56' -> ['1234,56', '1234.56', '1 234,56'] + '3045 52' -> ['3045.52', '3045,52', '304552'] (space as decimal sep) """ value = FieldNormalizer.clean_text(value) # Remove currency symbols and common suffixes value = re.sub(r'[SEK|kr|:-]+$', '', value, flags=re.IGNORECASE).strip() - # Remove spaces (thousand separators) - no_space = value.replace(' ', '').replace('\xa0', '') - variants = [value] + # Check for space as decimal separator pattern: "3045 52" (number space 2-digits) + # This is common in Swedish invoices where space separates öre from kronor + space_decimal_match = re.match(r'^(\d+)\s+(\d{2})$', value) + if space_decimal_match: + integer_part = space_decimal_match.group(1) + decimal_part = space_decimal_match.group(2) + # Add variants with different decimal separators + variants.append(f"{integer_part}.{decimal_part}") + variants.append(f"{integer_part},{decimal_part}") + variants.append(f"{integer_part}{decimal_part}") # No separator + + # Check for space as thousand separator with decimal: "10 571,00" or "10 571.00" + # Pattern: digits space digits comma/dot 2-digits + space_thousand_match = re.match(r'^(\d{1,3})[\s\xa0]+(\d{3})([,\.])(\d{2})$', value) + if space_thousand_match: + part1 = space_thousand_match.group(1) + part2 = space_thousand_match.group(2) + sep = space_thousand_match.group(3) + decimal = space_thousand_match.group(4) + combined = f"{part1}{part2}" + variants.append(f"{combined}.{decimal}") + variants.append(f"{combined},{decimal}") + variants.append(f"{combined}{decimal}") + # Also add variant with space preserved but different decimal sep + other_sep = ',' if sep == '.' else '.' + variants.append(f"{part1} {part2}{other_sep}{decimal}") + + # Handle US format: "1,390.00" (comma as thousand separator, dot as decimal) + us_format_match = re.match(r'^(\d{1,3}),(\d{3})\.(\d{2})$', value) + if us_format_match: + part1 = us_format_match.group(1) + part2 = us_format_match.group(2) + decimal = us_format_match.group(3) + combined = f"{part1}{part2}" + variants.append(f"{combined}.{decimal}") + variants.append(f"{combined},{decimal}") + variants.append(combined) # Without decimal + # European format: 1.390,00 + variants.append(f"{part1}.{part2},{decimal}") + + # Handle European format: "1.390,00" (dot as thousand separator, comma as decimal) + eu_format_match = re.match(r'^(\d{1,3})\.(\d{3}),(\d{2})$', value) + if eu_format_match: + part1 = eu_format_match.group(1) + part2 = eu_format_match.group(2) + decimal = eu_format_match.group(3) + combined = f"{part1}{part2}" + variants.append(f"{combined}.{decimal}") + variants.append(f"{combined},{decimal}") + variants.append(combined) # Without decimal + # US format: 1,390.00 + variants.append(f"{part1},{part2}.{decimal}") + + # Remove spaces (thousand separators) including non-breaking space + no_space = value.replace(' ', '').replace('\xa0', '') + # Normalize decimal separator if ',' in no_space: dot_version = no_space.replace(',', '.') @@ -190,12 +244,12 @@ class FieldNormalizer: # European format with dot as thousand separator if num >= 1000: - # Split integer and decimal parts - int_part = int(num) - dec_part = num - int_part + # Split integer and decimal parts using string formatting to avoid precision loss + formatted_str = f"{num:.2f}" + int_str, dec_str = formatted_str.split(".") + int_part = int(int_str) formatted_int = f"{int_part:,}".replace(',', '.') - formatted = f"{formatted_int},{dec_part:.2f}"[2:] # Remove "0." - variants.append(f"{formatted_int},{int(dec_part * 100):02d}") # 20.485,00 + variants.append(f"{formatted_int},{dec_str}") # 3.045,52 except ValueError: pass