Files
invoice-master-poc-v2/src/normalize/normalizer.py
2026-01-25 15:21:11 +01:00

187 lines
5.3 KiB
Python

"""
Field Normalization Module
Normalizes field values to generate multiple candidate forms for matching.
This module now delegates to individual normalizer modules for each field type.
Each normalizer is a separate, reusable module that can be used independently.
"""
from dataclasses import dataclass
from typing import Callable
from src.utils.text_cleaner import TextCleaner
# Import individual normalizers
from .normalizers import (
InvoiceNumberNormalizer,
OCRNormalizer,
BankgiroNormalizer,
PlusgiroNormalizer,
AmountNormalizer,
DateNormalizer,
OrganisationNumberNormalizer,
SupplierAccountsNormalizer,
CustomerNumberNormalizer,
)
@dataclass
class NormalizedValue:
    """A source value bundled with the variants produced from it."""

    # The value as it was supplied, before normalization.
    original: str
    # Alternative forms associated with ``original``.
    variants: list[str]
    # Label of the field type the value belongs to
    # (presumably a field name such as 'Amount' — TODO confirm against callers).
    field_type: str
class FieldNormalizer:
    """
    Facade over the per-field normalizer modules.

    Each ``normalize_*`` static method forwards its argument to a shared,
    class-level normalizer instance and returns that normalizer's result
    unchanged. No normalization logic lives in this class itself.
    """

    # One shared instance per field type, created once when the class body runs.
    _invoice_number = InvoiceNumberNormalizer()
    _ocr_number = OCRNormalizer()
    _bankgiro = BankgiroNormalizer()
    _plusgiro = PlusgiroNormalizer()
    _amount = AmountNormalizer()
    _date = DateNormalizer()
    _organisation_number = OrganisationNumberNormalizer()
    _supplier_accounts = SupplierAccountsNormalizer()
    _customer_number = CustomerNumberNormalizer()

    # Alias of DateNormalizer.SWEDISH_MONTHS, kept so code that reads the
    # month names from this class continues to work.
    SWEDISH_MONTHS = DateNormalizer.SWEDISH_MONTHS

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean ``text`` using the shared ``TextCleaner``.

        Per the original contract this removes invisible characters and
        normalizes whitespace and dashes; the actual work is done by
        ``TextCleaner.clean_text``.
        """
        return TextCleaner.clean_text(text)

    @staticmethod
    def normalize_invoice_number(value: str) -> list[str]:
        """
        Produce variants of an invoice number.

        Forwards to the shared ``InvoiceNumberNormalizer`` instance.
        """
        return FieldNormalizer._invoice_number.normalize(value)

    @staticmethod
    def normalize_ocr_number(value: str) -> list[str]:
        """
        Produce variants of an OCR number (Swedish payment reference).

        Forwards to the shared ``OCRNormalizer`` instance.
        """
        return FieldNormalizer._ocr_number.normalize(value)

    @staticmethod
    def normalize_bankgiro(value: str) -> list[str]:
        """
        Produce variants of a Bankgiro number.

        Forwards to the shared ``BankgiroNormalizer`` instance.
        """
        return FieldNormalizer._bankgiro.normalize(value)

    @staticmethod
    def normalize_plusgiro(value: str) -> list[str]:
        """
        Produce variants of a Plusgiro number.

        Forwards to the shared ``PlusgiroNormalizer`` instance.
        """
        return FieldNormalizer._plusgiro.normalize(value)

    @staticmethod
    def normalize_organisation_number(value: str) -> list[str]:
        """
        Produce variants of a Swedish organisation number, including VAT
        number variants.

        Forwards to the shared ``OrganisationNumberNormalizer`` instance.
        """
        return FieldNormalizer._organisation_number.normalize(value)

    @staticmethod
    def normalize_supplier_accounts(value: str) -> list[str]:
        """
        Produce variants of a supplier accounts field.

        Forwards to the shared ``SupplierAccountsNormalizer`` instance.
        """
        return FieldNormalizer._supplier_accounts.normalize(value)

    @staticmethod
    def normalize_customer_number(value: str) -> list[str]:
        """
        Produce variants of a customer number.

        Forwards to the shared ``CustomerNumberNormalizer`` instance.
        """
        return FieldNormalizer._customer_number.normalize(value)

    @staticmethod
    def normalize_amount(value: str) -> list[str]:
        """
        Produce variants of a monetary amount.

        Forwards to the shared ``AmountNormalizer`` instance.
        """
        return FieldNormalizer._amount.normalize(value)

    @staticmethod
    def normalize_date(value: str) -> list[str]:
        """
        Produce variants of a date, with YYYY-MM-DD as the normalized form.

        Forwards to the shared ``DateNormalizer`` instance.
        """
        return FieldNormalizer._date.normalize(value)
# Field type to normalizer mapping.
# normalize_field() looks the field name up here with an exact (case-sensitive)
# dict lookup; names that are not listed fall back to plain text cleaning.
# Both date fields ('InvoiceDate', 'InvoiceDueDate') share the date normalizer.
NORMALIZERS: dict[str, Callable[[str], list[str]]] = {
    'InvoiceNumber': FieldNormalizer.normalize_invoice_number,
    'OCR': FieldNormalizer.normalize_ocr_number,
    'Bankgiro': FieldNormalizer.normalize_bankgiro,
    'Plusgiro': FieldNormalizer.normalize_plusgiro,
    'Amount': FieldNormalizer.normalize_amount,
    'InvoiceDate': FieldNormalizer.normalize_date,
    'InvoiceDueDate': FieldNormalizer.normalize_date,
    'supplier_organisation_number': FieldNormalizer.normalize_organisation_number,
    'supplier_accounts': FieldNormalizer.normalize_supplier_accounts,
    'customer_number': FieldNormalizer.normalize_customer_number,
}
def normalize_field(field_name: str, value: str) -> list[str]:
    """
    Return the normalized variants of ``value`` for the given field.

    Args:
        field_name: Name of the field (e.g., 'InvoiceNumber', 'Amount'),
            used as the lookup key into ``NORMALIZERS``.
        value: Raw value to normalize. Non-string values are converted
            with ``str()`` before being normalized.

    Returns:
        An empty list when ``value`` is ``None`` or a blank string.
        Otherwise the list produced by the normalizer registered for
        ``field_name``, or, for unregistered field names, a one-element
        list holding the cleaned text.
    """
    # Nothing to normalize: missing value.
    if value is None:
        return []
    # Nothing to normalize: empty or whitespace-only string.
    if isinstance(value, str) and not value.strip():
        return []

    text = str(value)
    handler = NORMALIZERS.get(field_name)
    if not handler:
        # Unregistered field: only apply generic text cleaning.
        return [FieldNormalizer.clean_text(text)]
    return handler(text)