This commit is contained in:
Yaojia Wang
2026-01-15 23:02:38 +01:00
parent b26fd61852
commit 53d1e8db25

View File

@@ -139,17 +139,71 @@ class FieldNormalizer:
'114' -> ['114', '114,00', '114.00']
'114,00' -> ['114,00', '114.00', '114']
'1 234,56' -> ['1234,56', '1234.56', '1 234,56']
'3045 52' -> ['3045.52', '3045,52', '304552'] (space as decimal sep)
"""
value = FieldNormalizer.clean_text(value)
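# Remove trailing currency markers such as "SEK", "kr" or ":-"
# NOTE(review): the pattern below is a character class, not an alternation, so it
# strips any trailing run of the characters S, E, K, k, r, '|', ':' and '-'
# (case-insensitively) rather than only those exact suffixes.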
value = re.sub(r'[SEK|kr|:-]+$', '', value, flags=re.IGNORECASE).strip()
# Remove spaces (thousand separators)
no_space = value.replace(' ', '').replace('\xa0', '')
variants = [value]
# Check for space as decimal separator pattern: "3045 52" (number space 2-digits)
# This is common in Swedish invoices where space separates öre from kronor
space_decimal_match = re.match(r'^(\d+)\s+(\d{2})$', value)
if space_decimal_match:
integer_part = space_decimal_match.group(1)
decimal_part = space_decimal_match.group(2)
# Add variants with different decimal separators
variants.append(f"{integer_part}.{decimal_part}")
variants.append(f"{integer_part},{decimal_part}")
variants.append(f"{integer_part}{decimal_part}") # No separator
# Check for space as thousand separator with decimal: "10 571,00" or "10 571.00"
# Pattern: 1-3 digits, whitespace (incl. non-breaking space), exactly 3 digits,
# comma or dot, 2 digits. Only a single thousands group is matched.
space_thousand_match = re.match(r'^(\d{1,3})[\s\xa0]+(\d{3})([,\.])(\d{2})$', value)
if space_thousand_match:
part1 = space_thousand_match.group(1)
part2 = space_thousand_match.group(2)
sep = space_thousand_match.group(3)
decimal = space_thousand_match.group(4)
combined = f"{part1}{part2}"
variants.append(f"{combined}.{decimal}")
variants.append(f"{combined},{decimal}")
variants.append(f"{combined}{decimal}")
# Also add variant with space preserved but different decimal sep
other_sep = ',' if sep == '.' else '.'
variants.append(f"{part1} {part2}{other_sep}{decimal}")
# Handle US format: "1,390.00" (comma as thousand separator, dot as decimal)
us_format_match = re.match(r'^(\d{1,3}),(\d{3})\.(\d{2})$', value)
if us_format_match:
part1 = us_format_match.group(1)
part2 = us_format_match.group(2)
decimal = us_format_match.group(3)
combined = f"{part1}{part2}"
variants.append(f"{combined}.{decimal}")
variants.append(f"{combined},{decimal}")
variants.append(combined) # Without decimal
# European format: 1.390,00
variants.append(f"{part1}.{part2},{decimal}")
# Handle European format: "1.390,00" (dot as thousand separator, comma as decimal)
eu_format_match = re.match(r'^(\d{1,3})\.(\d{3}),(\d{2})$', value)
if eu_format_match:
part1 = eu_format_match.group(1)
part2 = eu_format_match.group(2)
decimal = eu_format_match.group(3)
combined = f"{part1}{part2}"
variants.append(f"{combined}.{decimal}")
variants.append(f"{combined},{decimal}")
variants.append(combined) # Without decimal
# US format: 1,390.00
variants.append(f"{part1},{part2}.{decimal}")
# Remove spaces (thousand separators) including non-breaking space
no_space = value.replace(' ', '').replace('\xa0', '')
# Normalize decimal separator
if ',' in no_space:
dot_version = no_space.replace(',', '.')
@@ -190,12 +244,12 @@ class FieldNormalizer:
# European format with dot as thousand separator
if num >= 1000:
# Split integer and decimal parts
int_part = int(num)
dec_part = num - int_part
# Split integer and decimal parts using string formatting to avoid precision loss
formatted_str = f"{num:.2f}"
int_str, dec_str = formatted_str.split(".")
int_part = int(int_str)
formatted_int = f"{int_part:,}".replace(',', '.')
formatted = f"{formatted_int},{dec_part:.2f}"[2:] # Remove "0."
variants.append(f"{formatted_int},{int(dec_part * 100):02d}") # 20.485,00
variants.append(f"{formatted_int},{dec_str}") # 3.045,52
except ValueError:
pass