code issue fix

This commit is contained in:
Yaojia Wang
2026-01-17 18:55:46 +01:00
parent 510890d18c
commit e9460e9f34
9 changed files with 729 additions and 57 deletions

View File

@@ -260,6 +260,45 @@ class FieldNormalizer:
return list(set(v for v in variants if v))
@staticmethod
def normalize_customer_number(value: str) -> list[str]:
    """
    Build matching variants for a customer number.

    Customer numbers are free-form identifiers: alphanumeric codes such as
    'EMM 256-6', 'ABC123' or 'A-1234', or plain digits such as '12345' or
    '123-456'. The input is first passed through
    ``FieldNormalizer.clean_text``; every variant is derived from that
    cleaned string:

    - the cleaned string itself
    - the cleaned string with spaces removed
    - the cleaned string with dashes removed
    - the cleaned string with both spaces and dashes removed
    - the cleaned string upper-cased and lower-cased

    Case variants are taken from the cleaned string only, not from the
    separator-stripped forms.

    Examples (assuming clean_text leaves these inputs unchanged; the order
    of the returned list is not defined because duplicates are dropped via
    a set):
        'EMM 256-6' -> 'EMM 256-6', 'EMM256-6', 'EMM 2566', 'EMM2566',
                       'emm 256-6'
        'ABC 123'   -> 'ABC 123', 'ABC123', 'abc 123'

    Returns:
        The unique, non-empty variants, in arbitrary order.
    """
    base = FieldNormalizer.clean_text(value)
    squeezed = base.replace(' ', '')

    # Duplicates (e.g. when the value has no spaces or dashes, or is already
    # upper-case) collapse in the set below, so no per-variant "did it
    # change?" checks are needed here.
    candidates = (
        base,
        squeezed,
        base.replace('-', ''),
        squeezed.replace('-', ''),
        base.upper(),
        base.lower(),
    )

    # Empty strings are filtered out so an empty input yields an empty list.
    return list({candidate for candidate in candidates if candidate})
@staticmethod
def normalize_amount(value: str) -> list[str]:
"""
@@ -414,7 +453,7 @@ class FieldNormalizer:
]
# Ambiguous patterns - try both DD/MM and MM/DD interpretations
ambiguous_patterns = [
ambiguous_patterns_4digit_year = [
# Format with / - could be DD/MM/YYYY (European) or MM/DD/YYYY (US)
r'^(\d{1,2})/(\d{1,2})/(\d{4})$',
# Format with . - typically European DD.MM.YYYY
@@ -423,6 +462,16 @@ class FieldNormalizer:
r'^(\d{1,2})-(\d{1,2})-(\d{4})$',
]
# Patterns with 2-digit year (common in Swedish invoices)
ambiguous_patterns_2digit_year = [
# Format DD.MM.YY (e.g., 02.08.25 for 2025-08-02)
r'^(\d{1,2})\.(\d{1,2})\.(\d{2})$',
# Format DD/MM/YY
r'^(\d{1,2})/(\d{1,2})/(\d{2})$',
# Format DD-MM-YY
r'^(\d{1,2})-(\d{1,2})-(\d{2})$',
]
# Try unambiguous patterns first
for pattern, extractor in date_patterns:
match = re.match(pattern, value)
@@ -434,9 +483,9 @@ class FieldNormalizer:
except ValueError:
continue
# Try ambiguous patterns with both interpretations
# Try ambiguous patterns with 4-digit year
if not parsed_dates:
for pattern in ambiguous_patterns:
for pattern in ambiguous_patterns_4digit_year:
match = re.match(pattern, value)
if match:
n1, n2, year = int(match[1]), int(match[2]), int(match[3])
@@ -457,6 +506,31 @@ class FieldNormalizer:
if parsed_dates:
break
# Try ambiguous patterns with 2-digit year (e.g., 02.08.25)
if not parsed_dates:
for pattern in ambiguous_patterns_2digit_year:
match = re.match(pattern, value)
if match:
n1, n2, yy = int(match[1]), int(match[2]), int(match[3])
# Convert 2-digit year to 4-digit (00-49 -> 2000s, 50-99 -> 1900s)
year = 2000 + yy if yy < 50 else 1900 + yy
# Try DD/MM/YY (European - day first, most common in Sweden)
try:
parsed_dates.append(datetime(year, n2, n1))
except ValueError:
pass
# Try MM/DD/YY (US - month first) if different and valid
if n1 != n2:
try:
parsed_dates.append(datetime(year, n1, n2))
except ValueError:
pass
if parsed_dates:
break
# Try Swedish month names
if not parsed_dates:
for month_name, month_num in FieldNormalizer.SWEDISH_MONTHS.items():
@@ -527,6 +601,7 @@ NORMALIZERS: dict[str, Callable[[str], list[str]]] = {
'InvoiceDueDate': FieldNormalizer.normalize_date,
'supplier_organisation_number': FieldNormalizer.normalize_organisation_number,
'supplier_accounts': FieldNormalizer.normalize_supplier_accounts,
'customer_number': FieldNormalizer.normalize_customer_number,
}