WIP
This commit is contained in:
@@ -2,6 +2,9 @@
|
||||
Field Normalization Module
|
||||
|
||||
Normalizes field values to generate multiple candidate forms for matching.
|
||||
|
||||
This module generates variants of CSV values for matching against OCR text.
|
||||
It uses shared utilities from src.utils for text cleaning and OCR error variants.
|
||||
"""
|
||||
|
||||
import re
|
||||
@@ -9,6 +12,10 @@ from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Callable
|
||||
|
||||
# Import shared utilities
|
||||
from src.utils.text_cleaner import TextCleaner
|
||||
from src.utils.format_variants import FormatVariants
|
||||
|
||||
|
||||
@dataclass
|
||||
class NormalizedValue:
|
||||
@@ -39,15 +46,11 @@ class FieldNormalizer:
|
||||
|
||||
@staticmethod
|
||||
def clean_text(text: str) -> str:
|
||||
"""Remove invisible characters and normalize whitespace and dashes."""
|
||||
# Remove zero-width characters
|
||||
text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
|
||||
# Normalize different dash types to standard hyphen-minus (ASCII 45)
|
||||
# en-dash (–, U+2013), em-dash (—, U+2014), minus sign (−, U+2212), middle dot (·, U+00B7)
|
||||
text = re.sub(r'[\u2013\u2014\u2212\u00b7]', '-', text)
|
||||
# Normalize whitespace
|
||||
text = ' '.join(text.split())
|
||||
return text.strip()
|
||||
"""Remove invisible characters and normalize whitespace and dashes.
|
||||
|
||||
Delegates to shared TextCleaner for consistency.
|
||||
"""
|
||||
return TextCleaner.clean_text(text)
|
||||
|
||||
@staticmethod
|
||||
def normalize_invoice_number(value: str) -> list[str]:
|
||||
@@ -81,57 +84,44 @@ class FieldNormalizer:
|
||||
"""
|
||||
Normalize Bankgiro number.
|
||||
|
||||
Uses shared FormatVariants plus OCR error variants.
|
||||
|
||||
Examples:
|
||||
'5393-9484' -> ['5393-9484', '53939484']
|
||||
'53939484' -> ['53939484', '5393-9484']
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
digits_only = re.sub(r'\D', '', value)
|
||||
# Use shared module for base variants
|
||||
variants = set(FormatVariants.bankgiro_variants(value))
|
||||
|
||||
variants = [value]
|
||||
# Add OCR error variants
|
||||
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
|
||||
if digits:
|
||||
for ocr_var in TextCleaner.generate_ocr_variants(digits):
|
||||
variants.add(ocr_var)
|
||||
|
||||
if digits_only:
|
||||
# Add without dash
|
||||
variants.append(digits_only)
|
||||
|
||||
# Add with dash (format: XXXX-XXXX for 8 digits)
|
||||
if len(digits_only) == 8:
|
||||
with_dash = f"{digits_only[:4]}-{digits_only[4:]}"
|
||||
variants.append(with_dash)
|
||||
elif len(digits_only) == 7:
|
||||
# Some bankgiro numbers are 7 digits: XXX-XXXX
|
||||
with_dash = f"{digits_only[:3]}-{digits_only[3:]}"
|
||||
variants.append(with_dash)
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
return list(v for v in variants if v)
|
||||
|
||||
@staticmethod
|
||||
def normalize_plusgiro(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize Plusgiro number.
|
||||
|
||||
Uses shared FormatVariants plus OCR error variants.
|
||||
|
||||
Examples:
|
||||
'1234567-8' -> ['1234567-8', '12345678']
|
||||
'12345678' -> ['12345678', '1234567-8']
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
digits_only = re.sub(r'\D', '', value)
|
||||
# Use shared module for base variants
|
||||
variants = set(FormatVariants.plusgiro_variants(value))
|
||||
|
||||
variants = [value]
|
||||
# Add OCR error variants
|
||||
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
|
||||
if digits:
|
||||
for ocr_var in TextCleaner.generate_ocr_variants(digits):
|
||||
variants.add(ocr_var)
|
||||
|
||||
if digits_only:
|
||||
variants.append(digits_only)
|
||||
|
||||
# Plusgiro format: XXXXXXX-X (7 digits + check digit)
|
||||
if len(digits_only) == 8:
|
||||
with_dash = f"{digits_only[:-1]}-{digits_only[-1]}"
|
||||
variants.append(with_dash)
|
||||
# Also handle 6+1 format
|
||||
elif len(digits_only) == 7:
|
||||
with_dash = f"{digits_only[:-1]}-{digits_only[-1]}"
|
||||
variants.append(with_dash)
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
return list(v for v in variants if v)
|
||||
|
||||
@staticmethod
|
||||
def normalize_organisation_number(value: str) -> list[str]:
|
||||
@@ -141,60 +131,27 @@ class FieldNormalizer:
|
||||
Organisation number format: NNNNNN-NNNN (6 digits + hyphen + 4 digits)
|
||||
Swedish VAT format: SE + org_number (10 digits) + 01
|
||||
|
||||
Uses shared FormatVariants for comprehensive variant generation,
|
||||
plus OCR error variants.
|
||||
|
||||
Examples:
|
||||
'556123-4567' -> ['556123-4567', '5561234567', 'SE556123456701', ...]
|
||||
'5561234567' -> ['5561234567', '556123-4567', 'SE556123456701', ...]
|
||||
'SE556123456701' -> ['SE556123456701', '5561234567', '556123-4567', ...]
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
# Use shared module for base variants
|
||||
variants = set(FormatVariants.organisation_number_variants(value))
|
||||
|
||||
# Check if input is a VAT number (starts with SE, ends with 01)
|
||||
org_digits = None
|
||||
if value.upper().startswith('SE') and len(value) >= 12:
|
||||
# Extract org number from VAT: SE + 10 digits + 01
|
||||
potential_org = re.sub(r'\D', '', value[2:]) # Remove SE prefix, keep digits
|
||||
if len(potential_org) == 12 and potential_org.endswith('01'):
|
||||
org_digits = potential_org[:-2] # Remove trailing 01
|
||||
elif len(potential_org) == 10:
|
||||
org_digits = potential_org
|
||||
# Add OCR error variants for digit sequences
|
||||
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
|
||||
if digits and len(digits) >= 10:
|
||||
# Generate variants where OCR might have misread characters
|
||||
for ocr_var in TextCleaner.generate_ocr_variants(digits[:10]):
|
||||
variants.add(ocr_var)
|
||||
if len(ocr_var) == 10:
|
||||
variants.add(f"{ocr_var[:6]}-{ocr_var[6:]}")
|
||||
|
||||
if org_digits is None:
|
||||
org_digits = re.sub(r'\D', '', value)
|
||||
|
||||
variants = [value]
|
||||
|
||||
if org_digits:
|
||||
variants.append(org_digits)
|
||||
|
||||
# Standard format: NNNNNN-NNNN (10 digits total)
|
||||
if len(org_digits) == 10:
|
||||
with_dash = f"{org_digits[:6]}-{org_digits[6:]}"
|
||||
variants.append(with_dash)
|
||||
|
||||
# Swedish VAT format: SE + org_number + 01
|
||||
vat_number = f"SE{org_digits}01"
|
||||
variants.append(vat_number)
|
||||
variants.append(vat_number.lower()) # se556123456701
|
||||
# With spaces: SE 5561234567 01
|
||||
variants.append(f"SE {org_digits} 01")
|
||||
variants.append(f"SE {org_digits[:6]}-{org_digits[6:]} 01")
|
||||
# Without 01 suffix (some invoices show just SE + org)
|
||||
variants.append(f"SE{org_digits}")
|
||||
variants.append(f"SE {org_digits}")
|
||||
|
||||
# Some may have 12 digits (century prefix): NNNNNNNN-NNNN
|
||||
elif len(org_digits) == 12:
|
||||
with_dash = f"{org_digits[:8]}-{org_digits[8:]}"
|
||||
variants.append(with_dash)
|
||||
# Also try without century prefix
|
||||
short_version = org_digits[2:]
|
||||
variants.append(short_version)
|
||||
variants.append(f"{short_version[:6]}-{short_version[6:]}")
|
||||
# VAT with short version
|
||||
vat_number = f"SE{short_version}01"
|
||||
variants.append(vat_number)
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
return list(v for v in variants if v)
|
||||
|
||||
@staticmethod
|
||||
def normalize_supplier_accounts(value: str) -> list[str]:
|
||||
|
||||
Reference in New Issue
Block a user