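Enhance FieldNormalizer amount variants: handle space as decimal separator ("3045 52"), space/US/European thousand separators ("10 571,00", "1,390.00", "1.390,00"), and split integer/decimal parts via string formatting to avoid float precision loss.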
This commit is contained in:
@@ -139,17 +139,71 @@ class FieldNormalizer:
|
|||||||
'114' -> ['114', '114,00', '114.00']
|
'114' -> ['114', '114,00', '114.00']
|
||||||
'114,00' -> ['114,00', '114.00', '114']
|
'114,00' -> ['114,00', '114.00', '114']
|
||||||
'1 234,56' -> ['1234,56', '1234.56', '1 234,56']
|
'1 234,56' -> ['1234,56', '1234.56', '1 234,56']
|
||||||
|
'3045 52' -> ['3045.52', '3045,52', '304552'] (space as decimal sep)
|
||||||
"""
|
"""
|
||||||
value = FieldNormalizer.clean_text(value)
|
value = FieldNormalizer.clean_text(value)
|
||||||
|
|
||||||
# Remove currency symbols and common suffixes
|
# Remove currency symbols and common suffixes
|
||||||
value = re.sub(r'[SEK|kr|:-]+$', '', value, flags=re.IGNORECASE).strip()
|
value = re.sub(r'[SEK|kr|:-]+$', '', value, flags=re.IGNORECASE).strip()
|
||||||
|
|
||||||
# Remove spaces (thousand separators)
|
|
||||||
no_space = value.replace(' ', '').replace('\xa0', '')
|
|
||||||
|
|
||||||
variants = [value]
|
variants = [value]
|
||||||
|
|
||||||
|
# Check for space as decimal separator pattern: "3045 52" (number space 2-digits)
|
||||||
|
# This is common in Swedish invoices where space separates öre from kronor
|
||||||
|
space_decimal_match = re.match(r'^(\d+)\s+(\d{2})$', value)
|
||||||
|
if space_decimal_match:
|
||||||
|
integer_part = space_decimal_match.group(1)
|
||||||
|
decimal_part = space_decimal_match.group(2)
|
||||||
|
# Add variants with different decimal separators
|
||||||
|
variants.append(f"{integer_part}.{decimal_part}")
|
||||||
|
variants.append(f"{integer_part},{decimal_part}")
|
||||||
|
variants.append(f"{integer_part}{decimal_part}") # No separator
|
||||||
|
|
||||||
|
# Check for space as thousand separator with decimal: "10 571,00" or "10 571.00"
|
||||||
|
# Pattern: digits space digits comma/dot 2-digits
|
||||||
|
space_thousand_match = re.match(r'^(\d{1,3})[\s\xa0]+(\d{3})([,\.])(\d{2})$', value)
|
||||||
|
if space_thousand_match:
|
||||||
|
part1 = space_thousand_match.group(1)
|
||||||
|
part2 = space_thousand_match.group(2)
|
||||||
|
sep = space_thousand_match.group(3)
|
||||||
|
decimal = space_thousand_match.group(4)
|
||||||
|
combined = f"{part1}{part2}"
|
||||||
|
variants.append(f"{combined}.{decimal}")
|
||||||
|
variants.append(f"{combined},{decimal}")
|
||||||
|
variants.append(f"{combined}{decimal}")
|
||||||
|
# Also add variant with space preserved but different decimal sep
|
||||||
|
other_sep = ',' if sep == '.' else '.'
|
||||||
|
variants.append(f"{part1} {part2}{other_sep}{decimal}")
|
||||||
|
|
||||||
|
# Handle US format: "1,390.00" (comma as thousand separator, dot as decimal)
|
||||||
|
us_format_match = re.match(r'^(\d{1,3}),(\d{3})\.(\d{2})$', value)
|
||||||
|
if us_format_match:
|
||||||
|
part1 = us_format_match.group(1)
|
||||||
|
part2 = us_format_match.group(2)
|
||||||
|
decimal = us_format_match.group(3)
|
||||||
|
combined = f"{part1}{part2}"
|
||||||
|
variants.append(f"{combined}.{decimal}")
|
||||||
|
variants.append(f"{combined},{decimal}")
|
||||||
|
variants.append(combined) # Without decimal
|
||||||
|
# European format: 1.390,00
|
||||||
|
variants.append(f"{part1}.{part2},{decimal}")
|
||||||
|
|
||||||
|
# Handle European format: "1.390,00" (dot as thousand separator, comma as decimal)
|
||||||
|
eu_format_match = re.match(r'^(\d{1,3})\.(\d{3}),(\d{2})$', value)
|
||||||
|
if eu_format_match:
|
||||||
|
part1 = eu_format_match.group(1)
|
||||||
|
part2 = eu_format_match.group(2)
|
||||||
|
decimal = eu_format_match.group(3)
|
||||||
|
combined = f"{part1}{part2}"
|
||||||
|
variants.append(f"{combined}.{decimal}")
|
||||||
|
variants.append(f"{combined},{decimal}")
|
||||||
|
variants.append(combined) # Without decimal
|
||||||
|
# US format: 1,390.00
|
||||||
|
variants.append(f"{part1},{part2}.{decimal}")
|
||||||
|
|
||||||
|
# Remove spaces (thousand separators) including non-breaking space
|
||||||
|
no_space = value.replace(' ', '').replace('\xa0', '')
|
||||||
|
|
||||||
# Normalize decimal separator
|
# Normalize decimal separator
|
||||||
if ',' in no_space:
|
if ',' in no_space:
|
||||||
dot_version = no_space.replace(',', '.')
|
dot_version = no_space.replace(',', '.')
|
||||||
@@ -190,12 +244,12 @@ class FieldNormalizer:
|
|||||||
|
|
||||||
# European format with dot as thousand separator
|
# European format with dot as thousand separator
|
||||||
if num >= 1000:
|
if num >= 1000:
|
||||||
# Split integer and decimal parts
|
# Split integer and decimal parts using string formatting to avoid precision loss
|
||||||
int_part = int(num)
|
formatted_str = f"{num:.2f}"
|
||||||
dec_part = num - int_part
|
int_str, dec_str = formatted_str.split(".")
|
||||||
|
int_part = int(int_str)
|
||||||
formatted_int = f"{int_part:,}".replace(',', '.')
|
formatted_int = f"{int_part:,}".replace(',', '.')
|
||||||
formatted = f"{formatted_int},{dec_part:.2f}"[2:] # Remove "0."
|
variants.append(f"{formatted_int},{dec_str}") # 3.045,52
|
||||||
variants.append(f"{formatted_int},{int(dec_part * 100):02d}") # 20.485,00
|
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user