This commit is contained in:
Yaojia Wang
2026-01-16 23:10:01 +01:00
parent 53d1e8db25
commit 425b8fdedf
10 changed files with 653 additions and 87 deletions

View File

@@ -39,9 +39,12 @@ class FieldNormalizer:
@staticmethod
def clean_text(text: str) -> str:
"""Remove invisible characters and normalize whitespace."""
"""Remove invisible characters and normalize whitespace and dashes."""
# Remove zero-width characters
text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
# Normalize different dash types to standard hyphen-minus (ASCII 45)
# en-dash (, U+2013), em-dash (—, U+2014), minus sign (, U+2212)
text = re.sub(r'[\u2013\u2014\u2212]', '-', text)
# Normalize whitespace
text = ' '.join(text.split())
return text.strip()
@@ -130,6 +133,133 @@ class FieldNormalizer:
return list(set(v for v in variants if v))
@staticmethod
def normalize_organisation_number(value: str) -> list[str]:
    """
    Build matching variants for a Swedish organisation number / VAT number.

    Organisation number layout: NNNNNN-NNNN (6 digits, hyphen, 4 digits).
    VAT layout produced here:   'SE' + 10 organisation digits + '01'.

    The input is first passed through FieldNormalizer.clean_text. If it
    starts with 'SE' (any case) and is at least 12 characters long, the
    digits after the prefix are treated as a VAT number and reduced to the
    10-digit organisation number; otherwise every digit of the input is used.

    Examples (the order of the returned list is not defined):
    '556123-4567' -> ['556123-4567', '5561234567', 'SE556123456701', ...]
    '5561234567' -> ['5561234567', '556123-4567', 'SE556123456701', ...]
    'SE556123456701' -> ['SE556123456701', '5561234567', '556123-4567', ...]

    Returns:
        De-duplicated list of non-empty variants, including the cleaned input.
    """
    cleaned = FieldNormalizer.clean_text(value)

    # VAT-shaped input: strip the 'SE' prefix and, if present, the '01' suffix.
    digits = None
    if len(cleaned) >= 12 and cleaned.upper().startswith('SE'):
        after_prefix = re.sub(r'\D', '', cleaned[2:])
        if len(after_prefix) == 10:
            digits = after_prefix
        elif len(after_prefix) == 12 and after_prefix.endswith('01'):
            digits = after_prefix[:-2]

    # Anything else: just keep whatever digits the input contains.
    if digits is None:
        digits = re.sub(r'\D', '', cleaned)

    # Empty entries are filtered out when the result set is built below.
    candidates = [cleaned, digits]

    if len(digits) == 10:
        # Standard 10-digit organisation number.
        dashed = f"{digits[:6]}-{digits[6:]}"
        vat = f"SE{digits}01"
        candidates += [
            dashed,
            vat,
            vat.lower(),            # se556123456701
            f"SE {digits} 01",      # spaced VAT
            f"SE {dashed} 01",      # spaced VAT with hyphenated number
            f"SE{digits}",          # VAT printed without the 01 suffix
            f"SE {digits}",
        ]
    elif len(digits) == 12:
        # 12 digits: treated as century prefix + 10-digit number.
        without_century = digits[2:]
        candidates += [
            f"{digits[:8]}-{digits[8:]}",
            without_century,
            f"{without_century[:6]}-{without_century[6:]}",
            f"SE{without_century}01",
        ]

    return list({candidate for candidate in candidates if candidate})
@staticmethod
def normalize_supplier_accounts(value: str) -> list[str]:
    """
    Normalize a supplier accounts field into matching variants.

    The field may contain multiple accounts separated by '|', each one
    optionally tagged with a prefix such as 'PG:' or 'BG:'.
    Format examples:
    'PG:48676043 | PG:49128028 | PG:8915035'
    'BG:5393-9484'

    Each account is normalized separately. For every account the result
    contains the account as written, the number without its prefix, the
    upper-cased prefix forms 'PREFIX:number' and 'PREFIX: number', the bare
    digits, and hyphenated layouts chosen by digit count:
      8 digits  -> NNNNNNN-N (Plusgiro) and NNNN-NNNN (Bankgiro)
      7 digits  -> NNNNNN-N (Plusgiro) and NNN-NNNN (Bankgiro)
      10 digits -> NNNNNN-NNNN (organisation-number style)

    Examples (the order of the returned list is not defined):
    'PG:48676043' -> ['PG:48676043', '48676043', '4867604-3', ...]
    'BG:5393-9484' -> ['BG:5393-9484', '5393-9484', '53939484', ...]
    'BG:1234567' -> ['BG:1234567', '1234567', '123-4567', ...]

    Returns:
        De-duplicated list of non-empty variants across all accounts.
    """
    value = FieldNormalizer.clean_text(value)
    variants = []

    # Split on '|' to handle multiple accounts; surrounding spaces are
    # stripped per account below.
    for account in value.split('|'):
        account = account.strip()
        if not account:
            continue

        # Always keep the account exactly as written.
        variants.append(account)

        # Separate an optional prefix (PG:, BG:, etc.) from the number.
        if ':' in account:
            prefix, number = account.split(':', 1)
            number = number.strip()
            prefix_upper = prefix.strip().upper()
            variants.append(number)  # Just the number without prefix
            variants.append(f"{prefix_upper}:{number}")
            variants.append(f"{prefix_upper}: {number}")  # With space
        else:
            number = account

        digits_only = re.sub(r'\D', '', number)
        if not digits_only:
            continue
        variants.append(digits_only)

        # The prefix is not trusted to tell Plusgiro from Bankgiro, so both
        # hyphenation layouts are emitted for the lengths they share.
        if len(digits_only) == 8:
            # Plusgiro: 7 digits + check digit
            variants.append(f"{digits_only[:-1]}-{digits_only[-1]}")
            # Bankgiro: 4-4
            variants.append(f"{digits_only[:4]}-{digits_only[4:]}")
        elif len(digits_only) == 7:
            # Plusgiro: 6 digits + check digit
            variants.append(f"{digits_only[:-1]}-{digits_only[-1]}")
            # Bankgiro: 3-4 (previously missing for undashed 7-digit input)
            variants.append(f"{digits_only[:3]}-{digits_only[3:]}")
        elif len(digits_only) == 10:
            # 6-4 format (like org number)
            variants.append(f"{digits_only[:6]}-{digits_only[6:]}")

    return list(set(v for v in variants if v))
@staticmethod
def normalize_amount(value: str) -> list[str]:
"""
@@ -264,40 +394,71 @@ class FieldNormalizer:
'2025-12-13' -> ['2025-12-13', '13/12/2025', '13.12.2025']
'13/12/2025' -> ['2025-12-13', '13/12/2025', ...]
'13 december 2025' -> ['2025-12-13', ...]
Note: For ambiguous formats like DD/MM/YYYY vs MM/DD/YYYY,
we generate variants for BOTH interpretations to maximize matching.
"""
value = FieldNormalizer.clean_text(value)
variants = [value]
parsed_date = None
parsed_dates = [] # May have multiple interpretations
# Try different date formats
date_patterns = [
# ISO format with optional time (e.g., 2026-01-09 00:00:00)
(r'^(\d{4})-(\d{1,2})-(\d{1,2})(?:\s+\d{1,2}:\d{2}:\d{2})?$', lambda m: (int(m[1]), int(m[2]), int(m[3]))),
# European format with /
(r'^(\d{1,2})/(\d{1,2})/(\d{4})$', lambda m: (int(m[3]), int(m[2]), int(m[1]))),
# European format with .
(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', lambda m: (int(m[3]), int(m[2]), int(m[1]))),
# European format with -
(r'^(\d{1,2})-(\d{1,2})-(\d{4})$', lambda m: (int(m[3]), int(m[2]), int(m[1]))),
# Swedish format: YYMMDD
(r'^(\d{2})(\d{2})(\d{2})$', lambda m: (2000 + int(m[1]) if int(m[1]) < 50 else 1900 + int(m[1]), int(m[2]), int(m[3]))),
# Swedish format: YYYYMMDD
(r'^(\d{4})(\d{2})(\d{2})$', lambda m: (int(m[1]), int(m[2]), int(m[3]))),
]
# Ambiguous patterns - try both DD/MM and MM/DD interpretations
ambiguous_patterns = [
# Format with / - could be DD/MM/YYYY (European) or MM/DD/YYYY (US)
r'^(\d{1,2})/(\d{1,2})/(\d{4})$',
# Format with . - typically European DD.MM.YYYY
r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$',
# Format with - (not ISO) - could be DD-MM-YYYY or MM-DD-YYYY
r'^(\d{1,2})-(\d{1,2})-(\d{4})$',
]
# Try unambiguous patterns first
for pattern, extractor in date_patterns:
match = re.match(pattern, value)
if match:
try:
year, month, day = extractor(match)
parsed_date = datetime(year, month, day)
parsed_dates.append(datetime(year, month, day))
break
except ValueError:
continue
# Try ambiguous patterns with both interpretations
if not parsed_dates:
for pattern in ambiguous_patterns:
match = re.match(pattern, value)
if match:
n1, n2, year = int(match[1]), int(match[2]), int(match[3])
# Try DD/MM/YYYY (European - day first)
try:
parsed_dates.append(datetime(year, n2, n1))
except ValueError:
pass
# Try MM/DD/YYYY (US - month first) if different and valid
if n1 != n2:
try:
parsed_dates.append(datetime(year, n1, n2))
except ValueError:
pass
if parsed_dates:
break
# Try Swedish month names
if not parsed_date:
if not parsed_dates:
for month_name, month_num in FieldNormalizer.SWEDISH_MONTHS.items():
if month_name in value.lower():
# Extract day and year
@@ -308,16 +469,28 @@ class FieldNormalizer:
if year < 100:
year = 2000 + year if year < 50 else 1900 + year
try:
parsed_date = datetime(year, int(month_num), day)
parsed_dates.append(datetime(year, int(month_num), day))
break
except ValueError:
continue
if parsed_date:
# Generate variants for all parsed date interpretations
swedish_months_full = [
'januari', 'februari', 'mars', 'april', 'maj', 'juni',
'juli', 'augusti', 'september', 'oktober', 'november', 'december'
]
swedish_months_abbrev = [
'jan', 'feb', 'mar', 'apr', 'maj', 'jun',
'jul', 'aug', 'sep', 'okt', 'nov', 'dec'
]
for parsed_date in parsed_dates:
# Generate different formats
iso = parsed_date.strftime('%Y-%m-%d')
eu_slash = parsed_date.strftime('%d/%m/%Y')
us_slash = parsed_date.strftime('%m/%d/%Y') # US format MM/DD/YYYY
eu_dot = parsed_date.strftime('%d.%m.%Y')
iso_dot = parsed_date.strftime('%Y.%m.%d') # ISO with dots (e.g., 2024.02.08)
compact = parsed_date.strftime('%Y%m%d') # YYYYMMDD
compact_short = parsed_date.strftime('%y%m%d') # YYMMDD (e.g., 260108)
@@ -329,21 +502,13 @@ class FieldNormalizer:
spaced_short = parsed_date.strftime('%y %m %d')
# Swedish month name formats (e.g., "9 januari 2026", "9 jan 2026")
swedish_months_full = [
'januari', 'februari', 'mars', 'april', 'maj', 'juni',
'juli', 'augusti', 'september', 'oktober', 'november', 'december'
]
swedish_months_abbrev = [
'jan', 'feb', 'mar', 'apr', 'maj', 'jun',
'jul', 'aug', 'sep', 'okt', 'nov', 'dec'
]
month_full = swedish_months_full[parsed_date.month - 1]
month_abbrev = swedish_months_abbrev[parsed_date.month - 1]
swedish_format_full = f"{parsed_date.day} {month_full} {parsed_date.year}"
swedish_format_abbrev = f"{parsed_date.day} {month_abbrev} {parsed_date.year}"
variants.extend([
iso, eu_slash, eu_dot, compact, compact_short,
iso, eu_slash, us_slash, eu_dot, iso_dot, compact, compact_short,
eu_dot_short, spaced_full, spaced_short,
swedish_format_full, swedish_format_abbrev
])
@@ -360,6 +525,8 @@ NORMALIZERS: dict[str, Callable[[str], list[str]]] = {
'Amount': FieldNormalizer.normalize_amount,
'InvoiceDate': FieldNormalizer.normalize_date,
'InvoiceDueDate': FieldNormalizer.normalize_date,
'supplier_organisation_number': FieldNormalizer.normalize_organisation_number,
'supplier_accounts': FieldNormalizer.normalize_supplier_accounts,
}