Re-structure the project.

This commit is contained in:
Yaojia Wang
2026-01-25 15:21:11 +01:00
parent 8fd61ea928
commit e599424a92
80 changed files with 10672 additions and 1584 deletions

View File

@@ -3,18 +3,26 @@ Field Normalization Module
Normalizes field values to generate multiple candidate forms for matching.
This module generates variants of CSV values for matching against OCR text.
It uses shared utilities from src.utils for text cleaning and OCR error variants.
This module now delegates to individual normalizer modules for each field type.
Each normalizer is a separate, reusable module that can be used independently.
"""
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Callable
# Import shared utilities
from src.utils.text_cleaner import TextCleaner
from src.utils.format_variants import FormatVariants
# Import individual normalizers
from .normalizers import (
InvoiceNumberNormalizer,
OCRNormalizer,
BankgiroNormalizer,
PlusgiroNormalizer,
AmountNormalizer,
DateNormalizer,
OrganisationNumberNormalizer,
SupplierAccountsNormalizer,
CustomerNumberNormalizer,
)
@dataclass
@@ -26,27 +34,32 @@ class NormalizedValue:
class FieldNormalizer:
"""Handles normalization of different invoice field types."""
"""
Handles normalization of different invoice field types.
# Common Swedish month names for date parsing
SWEDISH_MONTHS = {
'januari': '01', 'jan': '01',
'februari': '02', 'feb': '02',
'mars': '03', 'mar': '03',
'april': '04', 'apr': '04',
'maj': '05',
'juni': '06', 'jun': '06',
'juli': '07', 'jul': '07',
'augusti': '08', 'aug': '08',
'september': '09', 'sep': '09', 'sept': '09',
'oktober': '10', 'okt': '10',
'november': '11', 'nov': '11',
'december': '12', 'dec': '12'
}
This class now acts as a facade that delegates to individual
normalizer modules. Each field type has its own specialized
normalizer for better modularity and reusability.
"""
# One shared instance of each field-specific normalizer. These are class
# attributes, so they are created once when the class body executes and are
# then reached by the normalize_* static methods as FieldNormalizer._<name>.
# NOTE(review): sharing a single instance assumes the normalizers keep no
# per-call state - confirm against the normalizers package.
_invoice_number = InvoiceNumberNormalizer()
_ocr_number = OCRNormalizer()
_bankgiro = BankgiroNormalizer()
_plusgiro = PlusgiroNormalizer()
_amount = AmountNormalizer()
_date = DateNormalizer()
_organisation_number = OrganisationNumberNormalizer()
_supplier_accounts = SupplierAccountsNormalizer()
_customer_number = CustomerNumberNormalizer()
# Alias kept for backward compatibility: code that reads
# FieldNormalizer.SWEDISH_MONTHS gets the table defined on DateNormalizer.
SWEDISH_MONTHS = DateNormalizer.SWEDISH_MONTHS
@staticmethod
def clean_text(text: str) -> str:
"""Remove invisible characters and normalize whitespace and dashes.
"""
Remove invisible characters and normalize whitespace and dashes.
Delegates to shared TextCleaner for consistency.
"""
@@ -56,517 +69,82 @@ class FieldNormalizer:
def normalize_invoice_number(value: str) -> list[str]:
    """
    Normalize invoice number.

    Delegates to the class-level InvoiceNumberNormalizer instance; the
    variant-generation rules live in that normalizer, not in this facade.

    Args:
        value: Raw invoice number text (e.g. 'INV-100017500321').

    Returns:
        The result of ``InvoiceNumberNormalizer.normalize(value)``,
        annotated as a list of candidate strings.
    """
    # Single exit point: the former inline regex implementation returned
    # before this call and made the delegation unreachable.
    return FieldNormalizer._invoice_number.normalize(value)
@staticmethod
def normalize_ocr_number(value: str) -> list[str]:
    """
    Normalize OCR number (Swedish payment reference).

    Delegates to the class-level OCRNormalizer instance.

    Args:
        value: Raw OCR number text.

    Returns:
        The result of ``OCRNormalizer.normalize(value)``, annotated as a
        list of candidate strings.
    """
    # Route through the dedicated OCR normalizer rather than re-using the
    # invoice-number path, which previously returned first.
    return FieldNormalizer._ocr_number.normalize(value)
@staticmethod
def normalize_bankgiro(value: str) -> list[str]:
    """
    Normalize Bankgiro number.

    Delegates to the class-level BankgiroNormalizer instance.

    Args:
        value: Raw Bankgiro text, e.g. '5393-9484' or '53939484'.

    Returns:
        The result of ``BankgiroNormalizer.normalize(value)``, annotated
        as a list of candidate strings.
    """
    # Single exit point: the former inline FormatVariants/TextCleaner body
    # returned before this call and made the delegation unreachable.
    return FieldNormalizer._bankgiro.normalize(value)
@staticmethod
def normalize_plusgiro(value: str) -> list[str]:
    """
    Normalize Plusgiro number.

    Delegates to the class-level PlusgiroNormalizer instance.

    Args:
        value: Raw Plusgiro text, e.g. '1234567-8' or '12345678'.

    Returns:
        The result of ``PlusgiroNormalizer.normalize(value)``, annotated
        as a list of candidate strings.
    """
    # Single exit point: the former inline FormatVariants/TextCleaner body
    # returned before this call and made the delegation unreachable.
    return FieldNormalizer._plusgiro.normalize(value)
@staticmethod
def normalize_organisation_number(value: str) -> list[str]:
    """
    Normalize Swedish organisation number and generate VAT number variants.

    Input forms seen by this field:
        Organisation number: NNNNNN-NNNN (6 digits + hyphen + 4 digits)
        Swedish VAT number:  SE + org_number (10 digits) + 01

    Delegates to the class-level OrganisationNumberNormalizer instance.

    Args:
        value: Raw organisation or VAT number text, e.g. '556123-4567',
            '5561234567' or 'SE556123456701'.

    Returns:
        The result of ``OrganisationNumberNormalizer.normalize(value)``,
        annotated as a list of candidate strings.
    """
    # Single exit point: the former inline implementation returned before
    # this call and made the delegation unreachable.
    return FieldNormalizer._organisation_number.normalize(value)
@staticmethod
def normalize_supplier_accounts(value: str) -> list[str]:
    """
    Normalize supplier accounts field.

    The field may contain multiple accounts separated by ' | ', e.g.
    'PG:48676043 | PG:49128028 | PG:8915035' or 'BG:5393-9484'.

    Delegates to the class-level SupplierAccountsNormalizer instance.

    Args:
        value: Raw supplier-accounts text as described above.

    Returns:
        The result of ``SupplierAccountsNormalizer.normalize(value)``,
        annotated as a list of candidate strings.
    """
    # Single exit point: the former inline split/prefix-stripping body
    # returned before this call and made the delegation unreachable.
    return FieldNormalizer._supplier_accounts.normalize(value)
@staticmethod
def normalize_customer_number(value: str) -> list[str]:
    """
    Normalize customer number.

    Customer numbers can have various formats:
    - Alphanumeric codes: 'EMM 256-6', 'ABC123', 'A-1234'
    - Pure numbers: '12345', '123-456'

    Delegates to the class-level CustomerNumberNormalizer instance.

    Args:
        value: Raw customer number text.

    Returns:
        The result of ``CustomerNumberNormalizer.normalize(value)``,
        annotated as a list of candidate strings.
    """
    # Single exit point: the former inline space/dash/case variant builder
    # returned before this call and made the delegation unreachable.
    return FieldNormalizer._customer_number.normalize(value)
@staticmethod
def normalize_amount(value: str) -> list[str]:
    """
    Normalize monetary amount.

    Delegates to the class-level AmountNormalizer instance.

    Args:
        value: Raw amount text, e.g. '114', '114,00', '1 234,56' or
            '3045 52'.

    Returns:
        The result of ``AmountNormalizer.normalize(value)``, annotated as
        a list of candidate strings.
    """
    # Single exit point: the former inline implementation returned before
    # this call and made the delegation unreachable. That body also
    # stripped currency suffixes with the character class
    # '[SEK|kr|:-]+$' where an alternation was intended.
    # NOTE(review): check AmountNormalizer did not inherit that pattern.
    return FieldNormalizer._amount.normalize(value)
@staticmethod
def normalize_date(value: str) -> list[str]:
    """
    Normalize date to YYYY-MM-DD and generate variants.

    Delegates to the class-level DateNormalizer instance.

    Args:
        value: Raw date text, e.g. '2025-12-13', '13/12/2025' or
            '13 december 2025'.

    Returns:
        The result of ``DateNormalizer.normalize(value)``, annotated as a
        list of candidate strings.
    """
    # Single exit point: the former inline parser returned before this
    # call and made the delegation unreachable. That body also built its
    # "middle dot" variant with strftime('%%%d'), which yields '%' plus
    # the day instead of a date.
    # NOTE(review): check DateNormalizer did not inherit that format string.
    return FieldNormalizer._date.normalize(value)
# Field type to normalizer mapping