WIP
This commit is contained in:
@@ -39,9 +39,12 @@ class FieldNormalizer:
|
||||
|
||||
@staticmethod
def clean_text(text: str) -> str:
    """Strip invisible characters, unify dash look-alikes, collapse whitespace.

    Zero-width characters (U+200B, U+200C, U+200D, U+FEFF) are deleted,
    the en dash (U+2013), em dash (U+2014) and minus sign (U+2212) are
    replaced by the ASCII hyphen-minus, and every run of whitespace is
    reduced to a single space with no leading or trailing whitespace.
    """
    # One translation table handles both the deletions (mapped to None)
    # and the dash replacements in a single pass over the string.
    substitutions = {
        0x200B: None,  # zero-width space
        0x200C: None,  # zero-width non-joiner
        0x200D: None,  # zero-width joiner
        0xFEFF: None,  # zero-width no-break space / BOM
        0x2013: '-',   # en dash
        0x2014: '-',   # em dash
        0x2212: '-',   # minus sign
    }
    without_invisibles = text.translate(substitutions)

    # split() with no argument drops leading/trailing whitespace and
    # treats any whitespace run as one separator.
    return ' '.join(without_invisibles.split())
|
||||
@@ -130,6 +133,133 @@ class FieldNormalizer:
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
|
||||
@staticmethod
def normalize_organisation_number(value: str) -> list[str]:
    """
    Expand a Swedish organisation number into its common written forms.

    The input may be a plain organisation number (with or without the
    hyphen) or a VAT number of the form SE + 10 digits + 01. The result
    is a de-duplicated list, in no particular order, that always
    includes the cleaned input.

    Organisation number format: NNNNNN-NNNN (6 digits + hyphen + 4 digits)
    Swedish VAT format: SE + org_number (10 digits) + 01

    Examples (each result includes at least the values shown):
        '556123-4567' -> '556123-4567', '5561234567', 'SE556123456701'
        '5561234567' -> '5561234567', '556123-4567', 'SE556123456701'
        'SE556123456701' -> 'SE556123456701', '5561234567', '556123-4567'
    """
    cleaned = FieldNormalizer.clean_text(value)

    # A value that looks like a VAT number: peel off the SE prefix and,
    # when present, the trailing 01 to recover the 10-digit core.
    digits = None
    if len(cleaned) >= 12 and cleaned.upper().startswith('SE'):
        after_prefix = re.sub(r'\D', '', cleaned[2:])
        if len(after_prefix) == 10:
            digits = after_prefix
        elif len(after_prefix) == 12 and after_prefix.endswith('01'):
            digits = after_prefix[:-2]

    # Anything else: just keep the digits of the whole value.
    if digits is None:
        digits = re.sub(r'\D', '', cleaned)

    forms = {cleaned, digits}

    if len(digits) == 10:
        hyphenated = f"{digits[:6]}-{digits[6:]}"
        vat = f"SE{digits}01"
        forms.update((
            hyphenated,
            vat,
            vat.lower(),                 # se556123456701
            f"SE {digits} 01",           # spaced VAT
            f"SE {hyphenated} 01",       # spaced VAT with hyphenated core
            f"SE{digits}",               # VAT printed without the 01 suffix
            f"SE {digits}",
        ))
    elif len(digits) == 12:
        # 12 digits: treated as a two-digit century prefix + 10-digit number.
        without_century = digits[2:]
        forms.update((
            f"{digits[:8]}-{digits[8:]}",
            without_century,
            f"{without_century[:6]}-{without_century[6:]}",
            f"SE{without_century}01",
        ))

    # Empty strings (e.g. blank input) are never returned.
    forms.discard('')
    return list(forms)
|
||||
|
||||
@staticmethod
def normalize_supplier_accounts(value: str) -> list[str]:
    """
    Normalize a supplier-accounts field into matching variants.

    The field may contain several accounts separated by '|', e.g.
    'PG:48676043 | PG:49128028 | PG:8915035' or 'BG:5393-9484'.
    Each account is expanded on its own and all variants are merged into
    one de-duplicated list (order is unspecified).

    For every non-empty account the result includes:
      * the account text as written,
      * the number without its 'XX:' prefix, plus 'XX:number' and
        'XX: number' with the prefix upper-cased,
      * the bare digits,
      * dashed layouts chosen by digit count:
          8 digits  -> NNNNNNN-N (Plusgiro) and NNNN-NNNN (Bankgiro)
          7 digits  -> NNNNNN-N (Plusgiro) and NNN-NNNN (Bankgiro)
          10 digits -> NNNNNN-NNNN

    Args:
        value: Raw field text, possibly holding several accounts.

    Returns:
        List of unique, non-empty variant strings.

    Examples (each result includes at least the values shown):
        'PG:48676043' -> 'PG:48676043', '48676043', '4867604-3'
        'BG:5393-9484' -> 'BG:5393-9484', '5393-9484', '53939484'
        'BG:1234567' -> 'BG:1234567', '1234567', '123-4567'
    """
    value = FieldNormalizer.clean_text(value)
    variants: list[str] = []

    for account in value.split('|'):
        account = account.strip()
        if not account:
            continue

        # Always keep the account exactly as it was written.
        variants.append(account)

        if ':' in account:
            # Split only on the first colon so the number part stays intact.
            prefix, number = account.split(':', 1)
            number = number.strip()
            variants.append(number)

            prefix_upper = prefix.strip().upper()
            variants.append(f"{prefix_upper}:{number}")
            variants.append(f"{prefix_upper}: {number}")  # With space
        else:
            number = account

        digits_only = re.sub(r'\D', '', number)
        if not digits_only:
            continue
        variants.append(digits_only)

        # The prefix is not trusted to pick the layout: both the Plusgiro
        # (check digit after the dash) and Bankgiro groupings are emitted
        # for lengths where both exist.
        if len(digits_only) == 8:
            variants.append(f"{digits_only[:-1]}-{digits_only[-1]}")
            variants.append(f"{digits_only[:4]}-{digits_only[4:]}")
        elif len(digits_only) == 7:
            variants.append(f"{digits_only[:-1]}-{digits_only[-1]}")
            # 7-digit Bankgiro numbers are written NNN-NNNN.
            variants.append(f"{digits_only[:3]}-{digits_only[3:]}")
        elif len(digits_only) == 10:
            # 6-4 format (like org number)
            variants.append(f"{digits_only[:6]}-{digits_only[6:]}")

    return list(set(v for v in variants if v))
|
||||
|
||||
@staticmethod
|
||||
def normalize_amount(value: str) -> list[str]:
|
||||
"""
|
||||
@@ -264,40 +394,71 @@ class FieldNormalizer:
|
||||
'2025-12-13' -> ['2025-12-13', '13/12/2025', '13.12.2025']
|
||||
'13/12/2025' -> ['2025-12-13', '13/12/2025', ...]
|
||||
'13 december 2025' -> ['2025-12-13', ...]
|
||||
|
||||
Note: For ambiguous formats like DD/MM/YYYY vs MM/DD/YYYY,
|
||||
we generate variants for BOTH interpretations to maximize matching.
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
variants = [value]
|
||||
|
||||
parsed_date = None
|
||||
parsed_dates = [] # May have multiple interpretations
|
||||
|
||||
# Try different date formats
|
||||
date_patterns = [
|
||||
# ISO format with optional time (e.g., 2026-01-09 00:00:00)
|
||||
(r'^(\d{4})-(\d{1,2})-(\d{1,2})(?:\s+\d{1,2}:\d{2}:\d{2})?$', lambda m: (int(m[1]), int(m[2]), int(m[3]))),
|
||||
# European format with /
|
||||
(r'^(\d{1,2})/(\d{1,2})/(\d{4})$', lambda m: (int(m[3]), int(m[2]), int(m[1]))),
|
||||
# European format with .
|
||||
(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', lambda m: (int(m[3]), int(m[2]), int(m[1]))),
|
||||
# European format with -
|
||||
(r'^(\d{1,2})-(\d{1,2})-(\d{4})$', lambda m: (int(m[3]), int(m[2]), int(m[1]))),
|
||||
# Swedish format: YYMMDD
|
||||
(r'^(\d{2})(\d{2})(\d{2})$', lambda m: (2000 + int(m[1]) if int(m[1]) < 50 else 1900 + int(m[1]), int(m[2]), int(m[3]))),
|
||||
# Swedish format: YYYYMMDD
|
||||
(r'^(\d{4})(\d{2})(\d{2})$', lambda m: (int(m[1]), int(m[2]), int(m[3]))),
|
||||
]
|
||||
|
||||
# Ambiguous patterns - try both DD/MM and MM/DD interpretations
|
||||
ambiguous_patterns = [
|
||||
# Format with / - could be DD/MM/YYYY (European) or MM/DD/YYYY (US)
|
||||
r'^(\d{1,2})/(\d{1,2})/(\d{4})$',
|
||||
# Format with . - typically European DD.MM.YYYY
|
||||
r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$',
|
||||
# Format with - (not ISO) - could be DD-MM-YYYY or MM-DD-YYYY
|
||||
r'^(\d{1,2})-(\d{1,2})-(\d{4})$',
|
||||
]
|
||||
|
||||
# Try unambiguous patterns first
|
||||
for pattern, extractor in date_patterns:
|
||||
match = re.match(pattern, value)
|
||||
if match:
|
||||
try:
|
||||
year, month, day = extractor(match)
|
||||
parsed_date = datetime(year, month, day)
|
||||
parsed_dates.append(datetime(year, month, day))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Try ambiguous patterns with both interpretations
|
||||
if not parsed_dates:
|
||||
for pattern in ambiguous_patterns:
|
||||
match = re.match(pattern, value)
|
||||
if match:
|
||||
n1, n2, year = int(match[1]), int(match[2]), int(match[3])
|
||||
|
||||
# Try DD/MM/YYYY (European - day first)
|
||||
try:
|
||||
parsed_dates.append(datetime(year, n2, n1))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Try MM/DD/YYYY (US - month first) if different and valid
|
||||
if n1 != n2:
|
||||
try:
|
||||
parsed_dates.append(datetime(year, n1, n2))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if parsed_dates:
|
||||
break
|
||||
|
||||
# Try Swedish month names
|
||||
if not parsed_date:
|
||||
if not parsed_dates:
|
||||
for month_name, month_num in FieldNormalizer.SWEDISH_MONTHS.items():
|
||||
if month_name in value.lower():
|
||||
# Extract day and year
|
||||
@@ -308,16 +469,28 @@ class FieldNormalizer:
|
||||
if year < 100:
|
||||
year = 2000 + year if year < 50 else 1900 + year
|
||||
try:
|
||||
parsed_date = datetime(year, int(month_num), day)
|
||||
parsed_dates.append(datetime(year, int(month_num), day))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
if parsed_date:
|
||||
# Generate variants for all parsed date interpretations
|
||||
swedish_months_full = [
|
||||
'januari', 'februari', 'mars', 'april', 'maj', 'juni',
|
||||
'juli', 'augusti', 'september', 'oktober', 'november', 'december'
|
||||
]
|
||||
swedish_months_abbrev = [
|
||||
'jan', 'feb', 'mar', 'apr', 'maj', 'jun',
|
||||
'jul', 'aug', 'sep', 'okt', 'nov', 'dec'
|
||||
]
|
||||
|
||||
for parsed_date in parsed_dates:
|
||||
# Generate different formats
|
||||
iso = parsed_date.strftime('%Y-%m-%d')
|
||||
eu_slash = parsed_date.strftime('%d/%m/%Y')
|
||||
us_slash = parsed_date.strftime('%m/%d/%Y') # US format MM/DD/YYYY
|
||||
eu_dot = parsed_date.strftime('%d.%m.%Y')
|
||||
iso_dot = parsed_date.strftime('%Y.%m.%d') # ISO with dots (e.g., 2024.02.08)
|
||||
compact = parsed_date.strftime('%Y%m%d') # YYYYMMDD
|
||||
compact_short = parsed_date.strftime('%y%m%d') # YYMMDD (e.g., 260108)
|
||||
|
||||
@@ -329,21 +502,13 @@ class FieldNormalizer:
|
||||
spaced_short = parsed_date.strftime('%y %m %d')
|
||||
|
||||
# Swedish month name formats (e.g., "9 januari 2026", "9 jan 2026")
|
||||
swedish_months_full = [
|
||||
'januari', 'februari', 'mars', 'april', 'maj', 'juni',
|
||||
'juli', 'augusti', 'september', 'oktober', 'november', 'december'
|
||||
]
|
||||
swedish_months_abbrev = [
|
||||
'jan', 'feb', 'mar', 'apr', 'maj', 'jun',
|
||||
'jul', 'aug', 'sep', 'okt', 'nov', 'dec'
|
||||
]
|
||||
month_full = swedish_months_full[parsed_date.month - 1]
|
||||
month_abbrev = swedish_months_abbrev[parsed_date.month - 1]
|
||||
swedish_format_full = f"{parsed_date.day} {month_full} {parsed_date.year}"
|
||||
swedish_format_abbrev = f"{parsed_date.day} {month_abbrev} {parsed_date.year}"
|
||||
|
||||
variants.extend([
|
||||
iso, eu_slash, eu_dot, compact, compact_short,
|
||||
iso, eu_slash, us_slash, eu_dot, iso_dot, compact, compact_short,
|
||||
eu_dot_short, spaced_full, spaced_short,
|
||||
swedish_format_full, swedish_format_abbrev
|
||||
])
|
||||
@@ -360,6 +525,8 @@ NORMALIZERS: dict[str, Callable[[str], list[str]]] = {
|
||||
'Amount': FieldNormalizer.normalize_amount,
|
||||
'InvoiceDate': FieldNormalizer.normalize_date,
|
||||
'InvoiceDueDate': FieldNormalizer.normalize_date,
|
||||
'supplier_organisation_number': FieldNormalizer.normalize_organisation_number,
|
||||
'supplier_accounts': FieldNormalizer.normalize_supplier_accounts,
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user