This commit is contained in:
Yaojia Wang
2026-01-22 22:03:24 +01:00
parent 4ea4bc96d4
commit 8fd61ea928
19 changed files with 4069 additions and 226 deletions

View File

@@ -2,6 +2,9 @@
Field Normalization Module
Normalizes field values to generate multiple candidate forms for matching.
This module generates variants of CSV values for matching against OCR text.
It uses shared utilities from src.utils for text cleaning and OCR error variants.
"""
import re
@@ -9,6 +12,10 @@ from dataclasses import dataclass
from datetime import datetime
from typing import Callable
# Import shared utilities
from src.utils.text_cleaner import TextCleaner
from src.utils.format_variants import FormatVariants
@dataclass
class NormalizedValue:
@@ -39,15 +46,11 @@ class FieldNormalizer:
@staticmethod
def clean_text(text: str) -> str:
"""Remove invisible characters and normalize whitespace and dashes."""
# Remove zero-width characters
text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
# Normalize different dash types to standard hyphen-minus (ASCII 45)
# en-dash (–, U+2013), em-dash (—, U+2014), minus sign (−, U+2212), middle dot (·, U+00B7)
text = re.sub(r'[\u2013\u2014\u2212\u00b7]', '-', text)
# Normalize whitespace
text = ' '.join(text.split())
return text.strip()
"""Remove invisible characters and normalize whitespace and dashes.
Delegates to shared TextCleaner for consistency.
"""
return TextCleaner.clean_text(text)
@staticmethod
def normalize_invoice_number(value: str) -> list[str]:
@@ -81,57 +84,44 @@ class FieldNormalizer:
"""
Normalize Bankgiro number.
Uses shared FormatVariants plus OCR error variants.
Examples:
'5393-9484' -> ['5393-9484', '53939484']
'53939484' -> ['53939484', '5393-9484']
"""
value = FieldNormalizer.clean_text(value)
digits_only = re.sub(r'\D', '', value)
# Use shared module for base variants
variants = set(FormatVariants.bankgiro_variants(value))
variants = [value]
# Add OCR error variants
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
if digits:
for ocr_var in TextCleaner.generate_ocr_variants(digits):
variants.add(ocr_var)
if digits_only:
# Add without dash
variants.append(digits_only)
# Add with dash (format: XXXX-XXXX for 8 digits)
if len(digits_only) == 8:
with_dash = f"{digits_only[:4]}-{digits_only[4:]}"
variants.append(with_dash)
elif len(digits_only) == 7:
# Some bankgiro numbers are 7 digits: XXX-XXXX
with_dash = f"{digits_only[:3]}-{digits_only[3:]}"
variants.append(with_dash)
return list(set(v for v in variants if v))
return list(v for v in variants if v)
@staticmethod
def normalize_plusgiro(value: str) -> list[str]:
"""
Normalize Plusgiro number.
Uses shared FormatVariants plus OCR error variants.
Examples:
'1234567-8' -> ['1234567-8', '12345678']
'12345678' -> ['12345678', '1234567-8']
"""
value = FieldNormalizer.clean_text(value)
digits_only = re.sub(r'\D', '', value)
# Use shared module for base variants
variants = set(FormatVariants.plusgiro_variants(value))
variants = [value]
# Add OCR error variants
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
if digits:
for ocr_var in TextCleaner.generate_ocr_variants(digits):
variants.add(ocr_var)
if digits_only:
variants.append(digits_only)
# Plusgiro format: XXXXXXX-X (7 digits + check digit)
if len(digits_only) == 8:
with_dash = f"{digits_only[:-1]}-{digits_only[-1]}"
variants.append(with_dash)
# Also handle 6+1 format
elif len(digits_only) == 7:
with_dash = f"{digits_only[:-1]}-{digits_only[-1]}"
variants.append(with_dash)
return list(set(v for v in variants if v))
return list(v for v in variants if v)
@staticmethod
def normalize_organisation_number(value: str) -> list[str]:
@@ -141,60 +131,27 @@ class FieldNormalizer:
Organisation number format: NNNNNN-NNNN (6 digits + hyphen + 4 digits)
Swedish VAT format: SE + org_number (10 digits) + 01
Uses shared FormatVariants for comprehensive variant generation,
plus OCR error variants.
Examples:
'556123-4567' -> ['556123-4567', '5561234567', 'SE556123456701', ...]
'5561234567' -> ['5561234567', '556123-4567', 'SE556123456701', ...]
'SE556123456701' -> ['SE556123456701', '5561234567', '556123-4567', ...]
"""
value = FieldNormalizer.clean_text(value)
# Use shared module for base variants
variants = set(FormatVariants.organisation_number_variants(value))
# Check if input is a VAT number (starts with SE, ends with 01)
org_digits = None
if value.upper().startswith('SE') and len(value) >= 12:
# Extract org number from VAT: SE + 10 digits + 01
potential_org = re.sub(r'\D', '', value[2:]) # Remove SE prefix, keep digits
if len(potential_org) == 12 and potential_org.endswith('01'):
org_digits = potential_org[:-2] # Remove trailing 01
elif len(potential_org) == 10:
org_digits = potential_org
# Add OCR error variants for digit sequences
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
if digits and len(digits) >= 10:
# Generate variants where OCR might have misread characters
for ocr_var in TextCleaner.generate_ocr_variants(digits[:10]):
variants.add(ocr_var)
if len(ocr_var) == 10:
variants.add(f"{ocr_var[:6]}-{ocr_var[6:]}")
if org_digits is None:
org_digits = re.sub(r'\D', '', value)
variants = [value]
if org_digits:
variants.append(org_digits)
# Standard format: NNNNNN-NNNN (10 digits total)
if len(org_digits) == 10:
with_dash = f"{org_digits[:6]}-{org_digits[6:]}"
variants.append(with_dash)
# Swedish VAT format: SE + org_number + 01
vat_number = f"SE{org_digits}01"
variants.append(vat_number)
variants.append(vat_number.lower()) # se556123456701
# With spaces: SE 5561234567 01
variants.append(f"SE {org_digits} 01")
variants.append(f"SE {org_digits[:6]}-{org_digits[6:]} 01")
# Without 01 suffix (some invoices show just SE + org)
variants.append(f"SE{org_digits}")
variants.append(f"SE {org_digits}")
# Some may have 12 digits (century prefix): NNNNNNNN-NNNN
elif len(org_digits) == 12:
with_dash = f"{org_digits[:8]}-{org_digits[8:]}"
variants.append(with_dash)
# Also try without century prefix
short_version = org_digits[2:]
variants.append(short_version)
variants.append(f"{short_version[:6]}-{short_version[6:]}")
# VAT with short version
vat_number = f"SE{short_version}01"
variants.append(vat_number)
return list(set(v for v in variants if v))
return list(v for v in variants if v)
@staticmethod
def normalize_supplier_accounts(value: str) -> list[str]: