187 lines
5.3 KiB
Python
187 lines
5.3 KiB
Python
"""
Field Normalization Module

Normalizes field values to generate multiple candidate forms for matching.

This module now delegates to individual normalizer modules for each field type.
Each normalizer is a separate, reusable module that can be used independently.
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Callable
|
|
from src.utils.text_cleaner import TextCleaner
|
|
|
|
# Import individual normalizers
|
|
from .normalizers import (
|
|
InvoiceNumberNormalizer,
|
|
OCRNormalizer,
|
|
BankgiroNormalizer,
|
|
PlusgiroNormalizer,
|
|
AmountNormalizer,
|
|
DateNormalizer,
|
|
OrganisationNumberNormalizer,
|
|
SupplierAccountsNormalizer,
|
|
CustomerNumberNormalizer,
|
|
)
|
|
|
|
|
|
@dataclass
class NormalizedValue:
    """Container pairing a raw field value with its normalized variants."""

    # The value exactly as it was supplied, before any normalization.
    original: str
    # Candidate forms derived from ``original``.
    variants: list[str]
    # Name of the field type the value belongs to.
    field_type: str
|
|
|
|
|
|
class FieldNormalizer:
    """
    Facade over the per-field normalizer modules.

    One normalizer instance per field type is created when the class body
    is executed and shared by every call. Each public static method simply
    forwards its argument to the matching instance's ``normalize`` method,
    so callers can keep using ``FieldNormalizer.normalize_*`` while the
    actual logic lives in the individual normalizer modules.
    """

    # Shared normalizer instances, one per supported field type.
    _invoice_number = InvoiceNumberNormalizer()
    _ocr_number = OCRNormalizer()
    _bankgiro = BankgiroNormalizer()
    _plusgiro = PlusgiroNormalizer()
    _amount = AmountNormalizer()
    _date = DateNormalizer()
    _organisation_number = OrganisationNumberNormalizer()
    _supplier_accounts = SupplierAccountsNormalizer()
    _customer_number = CustomerNumberNormalizer()

    # Re-exported from DateNormalizer so existing references to
    # FieldNormalizer.SWEDISH_MONTHS keep working.
    SWEDISH_MONTHS = DateNormalizer.SWEDISH_MONTHS

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean ``text`` using the shared TextCleaner.

        Per the original module notes, this removes invisible characters
        and normalizes whitespace and dashes; the work itself is done by
        ``TextCleaner.clean_text`` so all callers clean text the same way.
        """
        cleaned = TextCleaner.clean_text(text)
        return cleaned

    @staticmethod
    def normalize_invoice_number(value: str) -> list[str]:
        """
        Produce the variants of an invoice number.

        Forwards to the shared InvoiceNumberNormalizer instance.
        """
        delegate = FieldNormalizer._invoice_number
        return delegate.normalize(value)

    @staticmethod
    def normalize_ocr_number(value: str) -> list[str]:
        """
        Produce the variants of an OCR number (Swedish payment reference).

        Forwards to the shared OCRNormalizer instance.
        """
        delegate = FieldNormalizer._ocr_number
        return delegate.normalize(value)

    @staticmethod
    def normalize_bankgiro(value: str) -> list[str]:
        """
        Produce the variants of a Bankgiro number.

        Forwards to the shared BankgiroNormalizer instance.
        """
        delegate = FieldNormalizer._bankgiro
        return delegate.normalize(value)

    @staticmethod
    def normalize_plusgiro(value: str) -> list[str]:
        """
        Produce the variants of a Plusgiro number.

        Forwards to the shared PlusgiroNormalizer instance.
        """
        delegate = FieldNormalizer._plusgiro
        return delegate.normalize(value)

    @staticmethod
    def normalize_organisation_number(value: str) -> list[str]:
        """
        Produce the variants of a Swedish organisation number.

        Forwards to the shared OrganisationNumberNormalizer instance,
        which (per the original module notes) also generates VAT number
        variants.
        """
        delegate = FieldNormalizer._organisation_number
        return delegate.normalize(value)

    @staticmethod
    def normalize_supplier_accounts(value: str) -> list[str]:
        """
        Produce the variants of a supplier accounts field.

        Forwards to the shared SupplierAccountsNormalizer instance.
        """
        delegate = FieldNormalizer._supplier_accounts
        return delegate.normalize(value)

    @staticmethod
    def normalize_customer_number(value: str) -> list[str]:
        """
        Produce the variants of a customer number.

        Forwards to the shared CustomerNumberNormalizer instance.
        """
        delegate = FieldNormalizer._customer_number
        return delegate.normalize(value)

    @staticmethod
    def normalize_amount(value: str) -> list[str]:
        """
        Produce the variants of a monetary amount.

        Forwards to the shared AmountNormalizer instance.
        """
        delegate = FieldNormalizer._amount
        return delegate.normalize(value)

    @staticmethod
    def normalize_date(value: str) -> list[str]:
        """
        Produce the variants of a date.

        Forwards to the shared DateNormalizer instance, which (per the
        original module notes) normalizes to YYYY-MM-DD and generates
        further variants.
        """
        delegate = FieldNormalizer._date
        return delegate.normalize(value)
|
|
|
|
|
|
# Dispatch table: field name -> function that turns a raw string into its
# list of normalized variants. Lookups are by exact field name.
NORMALIZERS: dict[str, Callable[[str], list[str]]] = {
    # Identifiers and payment references
    "InvoiceNumber": FieldNormalizer.normalize_invoice_number,
    "OCR": FieldNormalizer.normalize_ocr_number,
    "Bankgiro": FieldNormalizer.normalize_bankgiro,
    "Plusgiro": FieldNormalizer.normalize_plusgiro,
    # Monetary amount
    "Amount": FieldNormalizer.normalize_amount,
    # Both date fields share the same date normalizer
    "InvoiceDate": FieldNormalizer.normalize_date,
    "InvoiceDueDate": FieldNormalizer.normalize_date,
    # Supplier / customer details
    "supplier_organisation_number": FieldNormalizer.normalize_organisation_number,
    "supplier_accounts": FieldNormalizer.normalize_supplier_accounts,
    "customer_number": FieldNormalizer.normalize_customer_number,
}
|
|
|
|
|
|
def normalize_field(field_name: str, value: str) -> list[str]:
    """
    Return the normalized variants of ``value`` for the given field.

    Args:
        field_name: Name of the field (e.g., 'InvoiceNumber', 'Amount');
            used as the lookup key into ``NORMALIZERS``.
        value: Raw value to normalize. ``None`` and blank strings yield
            no variants; anything else is converted with ``str()`` first.

    Returns:
        The list produced by the field's registered normalizer, or a
        single-element list holding the cleaned text when no normalizer
        is registered for ``field_name``. Empty list for missing input.
    """
    # Nothing to normalize: missing value or a whitespace-only string.
    if value is None:
        return []
    if isinstance(value, str) and not value.strip():
        return []

    text = str(value)
    handler = NORMALIZERS.get(field_name)

    # Unknown field type: fall back to plain text cleaning.
    if not handler:
        return [FieldNormalizer.clean_text(text)]

    return handler(text)
|