Re-structure the project.
This commit is contained in:
@@ -3,18 +3,26 @@ Field Normalization Module
|
||||
|
||||
Normalizes field values to generate multiple candidate forms for matching.
|
||||
|
||||
This module generates variants of CSV values for matching against OCR text.
|
||||
It uses shared utilities from src.utils for text cleaning and OCR error variants.
|
||||
This module now delegates to individual normalizer modules for each field type.
|
||||
Each normalizer is a separate, reusable module that can be used independently.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Callable
|
||||
|
||||
# Import shared utilities
|
||||
from src.utils.text_cleaner import TextCleaner
|
||||
from src.utils.format_variants import FormatVariants
|
||||
|
||||
# Import individual normalizers
|
||||
from .normalizers import (
|
||||
InvoiceNumberNormalizer,
|
||||
OCRNormalizer,
|
||||
BankgiroNormalizer,
|
||||
PlusgiroNormalizer,
|
||||
AmountNormalizer,
|
||||
DateNormalizer,
|
||||
OrganisationNumberNormalizer,
|
||||
SupplierAccountsNormalizer,
|
||||
CustomerNumberNormalizer,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -26,27 +34,32 @@ class NormalizedValue:
|
||||
|
||||
|
||||
class FieldNormalizer:
|
||||
"""Handles normalization of different invoice field types."""
|
||||
"""
|
||||
Handles normalization of different invoice field types.
|
||||
|
||||
# Common Swedish month names for date parsing
|
||||
SWEDISH_MONTHS = {
|
||||
'januari': '01', 'jan': '01',
|
||||
'februari': '02', 'feb': '02',
|
||||
'mars': '03', 'mar': '03',
|
||||
'april': '04', 'apr': '04',
|
||||
'maj': '05',
|
||||
'juni': '06', 'jun': '06',
|
||||
'juli': '07', 'jul': '07',
|
||||
'augusti': '08', 'aug': '08',
|
||||
'september': '09', 'sep': '09', 'sept': '09',
|
||||
'oktober': '10', 'okt': '10',
|
||||
'november': '11', 'nov': '11',
|
||||
'december': '12', 'dec': '12'
|
||||
}
|
||||
This class now acts as a facade that delegates to individual
|
||||
normalizer modules. Each field type has its own specialized
|
||||
normalizer for better modularity and reusability.
|
||||
"""
|
||||
|
||||
# Instantiate individual normalizers
|
||||
_invoice_number = InvoiceNumberNormalizer()
|
||||
_ocr_number = OCRNormalizer()
|
||||
_bankgiro = BankgiroNormalizer()
|
||||
_plusgiro = PlusgiroNormalizer()
|
||||
_amount = AmountNormalizer()
|
||||
_date = DateNormalizer()
|
||||
_organisation_number = OrganisationNumberNormalizer()
|
||||
_supplier_accounts = SupplierAccountsNormalizer()
|
||||
_customer_number = CustomerNumberNormalizer()
|
||||
|
||||
# Common Swedish month names for backward compatibility
|
||||
SWEDISH_MONTHS = DateNormalizer.SWEDISH_MONTHS
|
||||
|
||||
@staticmethod
|
||||
def clean_text(text: str) -> str:
|
||||
"""Remove invisible characters and normalize whitespace and dashes.
|
||||
"""
|
||||
Remove invisible characters and normalize whitespace and dashes.
|
||||
|
||||
Delegates to shared TextCleaner for consistency.
|
||||
"""
|
||||
@@ -56,517 +69,82 @@ class FieldNormalizer:
|
||||
def normalize_invoice_number(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize invoice number.
|
||||
Keeps only digits for matching.
|
||||
|
||||
Examples:
|
||||
'100017500321' -> ['100017500321']
|
||||
'INV-100017500321' -> ['100017500321', 'INV-100017500321']
|
||||
Delegates to InvoiceNumberNormalizer.
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
digits_only = re.sub(r'\D', '', value)
|
||||
|
||||
variants = [value]
|
||||
if digits_only and digits_only != value:
|
||||
variants.append(digits_only)
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
return FieldNormalizer._invoice_number.normalize(value)
|
||||
|
||||
@staticmethod
|
||||
def normalize_ocr_number(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize OCR number (Swedish payment reference).
|
||||
Similar to invoice number - digits only.
|
||||
|
||||
Delegates to OCRNormalizer.
|
||||
"""
|
||||
return FieldNormalizer.normalize_invoice_number(value)
|
||||
return FieldNormalizer._ocr_number.normalize(value)
|
||||
|
||||
@staticmethod
|
||||
def normalize_bankgiro(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize Bankgiro number.
|
||||
|
||||
Uses shared FormatVariants plus OCR error variants.
|
||||
|
||||
Examples:
|
||||
'5393-9484' -> ['5393-9484', '53939484']
|
||||
'53939484' -> ['53939484', '5393-9484']
|
||||
Delegates to BankgiroNormalizer.
|
||||
"""
|
||||
# Use shared module for base variants
|
||||
variants = set(FormatVariants.bankgiro_variants(value))
|
||||
|
||||
# Add OCR error variants
|
||||
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
|
||||
if digits:
|
||||
for ocr_var in TextCleaner.generate_ocr_variants(digits):
|
||||
variants.add(ocr_var)
|
||||
|
||||
return list(v for v in variants if v)
|
||||
return FieldNormalizer._bankgiro.normalize(value)
|
||||
|
||||
@staticmethod
|
||||
def normalize_plusgiro(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize Plusgiro number.
|
||||
|
||||
Uses shared FormatVariants plus OCR error variants.
|
||||
|
||||
Examples:
|
||||
'1234567-8' -> ['1234567-8', '12345678']
|
||||
'12345678' -> ['12345678', '1234567-8']
|
||||
Delegates to PlusgiroNormalizer.
|
||||
"""
|
||||
# Use shared module for base variants
|
||||
variants = set(FormatVariants.plusgiro_variants(value))
|
||||
|
||||
# Add OCR error variants
|
||||
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
|
||||
if digits:
|
||||
for ocr_var in TextCleaner.generate_ocr_variants(digits):
|
||||
variants.add(ocr_var)
|
||||
|
||||
return list(v for v in variants if v)
|
||||
return FieldNormalizer._plusgiro.normalize(value)
|
||||
|
||||
@staticmethod
|
||||
def normalize_organisation_number(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize Swedish organisation number and generate VAT number variants.
|
||||
|
||||
Organisation number format: NNNNNN-NNNN (6 digits + hyphen + 4 digits)
|
||||
Swedish VAT format: SE + org_number (10 digits) + 01
|
||||
|
||||
Uses shared FormatVariants for comprehensive variant generation,
|
||||
plus OCR error variants.
|
||||
|
||||
Examples:
|
||||
'556123-4567' -> ['556123-4567', '5561234567', 'SE556123456701', ...]
|
||||
'5561234567' -> ['5561234567', '556123-4567', 'SE556123456701', ...]
|
||||
'SE556123456701' -> ['SE556123456701', '5561234567', '556123-4567', ...]
|
||||
Delegates to OrganisationNumberNormalizer.
|
||||
"""
|
||||
# Use shared module for base variants
|
||||
variants = set(FormatVariants.organisation_number_variants(value))
|
||||
|
||||
# Add OCR error variants for digit sequences
|
||||
digits = TextCleaner.extract_digits(value, apply_ocr_correction=False)
|
||||
if digits and len(digits) >= 10:
|
||||
# Generate variants where OCR might have misread characters
|
||||
for ocr_var in TextCleaner.generate_ocr_variants(digits[:10]):
|
||||
variants.add(ocr_var)
|
||||
if len(ocr_var) == 10:
|
||||
variants.add(f"{ocr_var[:6]}-{ocr_var[6:]}")
|
||||
|
||||
return list(v for v in variants if v)
|
||||
return FieldNormalizer._organisation_number.normalize(value)
|
||||
|
||||
@staticmethod
|
||||
def normalize_supplier_accounts(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize supplier accounts field.
|
||||
|
||||
The field may contain multiple accounts separated by ' | '.
|
||||
Format examples:
|
||||
'PG:48676043 | PG:49128028 | PG:8915035'
|
||||
'BG:5393-9484'
|
||||
|
||||
Each account is normalized separately to generate variants.
|
||||
|
||||
Examples:
|
||||
'PG:48676043' -> ['PG:48676043', '48676043', '4867604-3']
|
||||
'BG:5393-9484' -> ['BG:5393-9484', '5393-9484', '53939484']
|
||||
Delegates to SupplierAccountsNormalizer.
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
variants = []
|
||||
|
||||
# Split by ' | ' to handle multiple accounts
|
||||
accounts = [acc.strip() for acc in value.split('|')]
|
||||
|
||||
for account in accounts:
|
||||
account = account.strip()
|
||||
if not account:
|
||||
continue
|
||||
|
||||
# Add original value
|
||||
variants.append(account)
|
||||
|
||||
# Remove prefix (PG:, BG:, etc.)
|
||||
if ':' in account:
|
||||
prefix, number = account.split(':', 1)
|
||||
number = number.strip()
|
||||
variants.append(number) # Just the number without prefix
|
||||
|
||||
# Also add with different prefix formats
|
||||
prefix_upper = prefix.strip().upper()
|
||||
variants.append(f"{prefix_upper}:{number}")
|
||||
variants.append(f"{prefix_upper}: {number}") # With space
|
||||
else:
|
||||
number = account
|
||||
|
||||
# Extract digits only
|
||||
digits_only = re.sub(r'\D', '', number)
|
||||
|
||||
if digits_only:
|
||||
variants.append(digits_only)
|
||||
|
||||
# Plusgiro format: XXXXXXX-X (7 digits + check digit)
|
||||
if len(digits_only) == 8:
|
||||
with_dash = f"{digits_only[:-1]}-{digits_only[-1]}"
|
||||
variants.append(with_dash)
|
||||
# Also try 4-4 format for bankgiro
|
||||
variants.append(f"{digits_only[:4]}-{digits_only[4:]}")
|
||||
elif len(digits_only) == 7:
|
||||
with_dash = f"{digits_only[:-1]}-{digits_only[-1]}"
|
||||
variants.append(with_dash)
|
||||
elif len(digits_only) == 10:
|
||||
# 6-4 format (like org number)
|
||||
variants.append(f"{digits_only[:6]}-{digits_only[6:]}")
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
return FieldNormalizer._supplier_accounts.normalize(value)
|
||||
|
||||
@staticmethod
|
||||
def normalize_customer_number(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize customer number.
|
||||
|
||||
Customer numbers can have various formats:
|
||||
- Alphanumeric codes: 'EMM 256-6', 'ABC123', 'A-1234'
|
||||
- Pure numbers: '12345', '123-456'
|
||||
|
||||
Examples:
|
||||
'EMM 256-6' -> ['EMM 256-6', 'EMM256-6', 'EMM2566']
|
||||
'ABC 123' -> ['ABC 123', 'ABC123']
|
||||
Delegates to CustomerNumberNormalizer.
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
variants = [value]
|
||||
|
||||
# Version without spaces
|
||||
no_space = value.replace(' ', '')
|
||||
if no_space != value:
|
||||
variants.append(no_space)
|
||||
|
||||
# Version without dashes
|
||||
no_dash = value.replace('-', '')
|
||||
if no_dash != value:
|
||||
variants.append(no_dash)
|
||||
|
||||
# Version without spaces and dashes
|
||||
clean = value.replace(' ', '').replace('-', '')
|
||||
if clean != value and clean not in variants:
|
||||
variants.append(clean)
|
||||
|
||||
# Uppercase and lowercase versions
|
||||
if value.upper() != value:
|
||||
variants.append(value.upper())
|
||||
if value.lower() != value:
|
||||
variants.append(value.lower())
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
return FieldNormalizer._customer_number.normalize(value)
|
||||
|
||||
@staticmethod
|
||||
def normalize_amount(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize monetary amount.
|
||||
|
||||
Examples:
|
||||
'114' -> ['114', '114,00', '114.00']
|
||||
'114,00' -> ['114,00', '114.00', '114']
|
||||
'1 234,56' -> ['1234,56', '1234.56', '1 234,56']
|
||||
'3045 52' -> ['3045.52', '3045,52', '304552'] (space as decimal sep)
|
||||
Delegates to AmountNormalizer.
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
|
||||
# Remove currency symbols and common suffixes
|
||||
value = re.sub(r'[SEK|kr|:-]+$', '', value, flags=re.IGNORECASE).strip()
|
||||
|
||||
variants = [value]
|
||||
|
||||
# Check for space as decimal separator pattern: "3045 52" (number space 2-digits)
|
||||
# This is common in Swedish invoices where space separates öre from kronor
|
||||
space_decimal_match = re.match(r'^(\d+)\s+(\d{2})$', value)
|
||||
if space_decimal_match:
|
||||
integer_part = space_decimal_match.group(1)
|
||||
decimal_part = space_decimal_match.group(2)
|
||||
# Add variants with different decimal separators
|
||||
variants.append(f"{integer_part}.{decimal_part}")
|
||||
variants.append(f"{integer_part},{decimal_part}")
|
||||
variants.append(f"{integer_part}{decimal_part}") # No separator
|
||||
|
||||
# Check for space as thousand separator with decimal: "10 571,00" or "10 571.00"
|
||||
# Pattern: digits space digits comma/dot 2-digits
|
||||
space_thousand_match = re.match(r'^(\d{1,3})[\s\xa0]+(\d{3})([,\.])(\d{2})$', value)
|
||||
if space_thousand_match:
|
||||
part1 = space_thousand_match.group(1)
|
||||
part2 = space_thousand_match.group(2)
|
||||
sep = space_thousand_match.group(3)
|
||||
decimal = space_thousand_match.group(4)
|
||||
combined = f"{part1}{part2}"
|
||||
variants.append(f"{combined}.{decimal}")
|
||||
variants.append(f"{combined},{decimal}")
|
||||
variants.append(f"{combined}{decimal}")
|
||||
# Also add variant with space preserved but different decimal sep
|
||||
other_sep = ',' if sep == '.' else '.'
|
||||
variants.append(f"{part1} {part2}{other_sep}{decimal}")
|
||||
|
||||
# Handle US format: "1,390.00" (comma as thousand separator, dot as decimal)
|
||||
us_format_match = re.match(r'^(\d{1,3}),(\d{3})\.(\d{2})$', value)
|
||||
if us_format_match:
|
||||
part1 = us_format_match.group(1)
|
||||
part2 = us_format_match.group(2)
|
||||
decimal = us_format_match.group(3)
|
||||
combined = f"{part1}{part2}"
|
||||
variants.append(f"{combined}.{decimal}")
|
||||
variants.append(f"{combined},{decimal}")
|
||||
variants.append(combined) # Without decimal
|
||||
# European format: 1.390,00
|
||||
variants.append(f"{part1}.{part2},{decimal}")
|
||||
|
||||
# Handle European format: "1.390,00" (dot as thousand separator, comma as decimal)
|
||||
eu_format_match = re.match(r'^(\d{1,3})\.(\d{3}),(\d{2})$', value)
|
||||
if eu_format_match:
|
||||
part1 = eu_format_match.group(1)
|
||||
part2 = eu_format_match.group(2)
|
||||
decimal = eu_format_match.group(3)
|
||||
combined = f"{part1}{part2}"
|
||||
variants.append(f"{combined}.{decimal}")
|
||||
variants.append(f"{combined},{decimal}")
|
||||
variants.append(combined) # Without decimal
|
||||
# US format: 1,390.00
|
||||
variants.append(f"{part1},{part2}.{decimal}")
|
||||
|
||||
# Remove spaces (thousand separators) including non-breaking space
|
||||
no_space = value.replace(' ', '').replace('\xa0', '')
|
||||
|
||||
# Normalize decimal separator
|
||||
if ',' in no_space:
|
||||
dot_version = no_space.replace(',', '.')
|
||||
variants.append(no_space)
|
||||
variants.append(dot_version)
|
||||
elif '.' in no_space:
|
||||
comma_version = no_space.replace('.', ',')
|
||||
variants.append(no_space)
|
||||
variants.append(comma_version)
|
||||
else:
|
||||
# Integer amount - add decimal versions
|
||||
variants.append(no_space)
|
||||
variants.append(f"{no_space},00")
|
||||
variants.append(f"{no_space}.00")
|
||||
|
||||
# Try to parse and get clean numeric value
|
||||
try:
|
||||
# Parse as float
|
||||
clean = no_space.replace(',', '.')
|
||||
num = float(clean)
|
||||
|
||||
# Integer if no decimals
|
||||
if num == int(num):
|
||||
int_val = int(num)
|
||||
variants.append(str(int_val))
|
||||
variants.append(f"{int_val},00")
|
||||
variants.append(f"{int_val}.00")
|
||||
|
||||
# European format with dot as thousand separator (e.g., 20.485,00)
|
||||
if int_val >= 1000:
|
||||
# Format: XX.XXX,XX
|
||||
formatted = f"{int_val:,}".replace(',', '.')
|
||||
variants.append(formatted) # 20.485
|
||||
variants.append(f"{formatted},00") # 20.485,00
|
||||
else:
|
||||
variants.append(f"{num:.2f}")
|
||||
variants.append(f"{num:.2f}".replace('.', ','))
|
||||
|
||||
# European format with dot as thousand separator
|
||||
if num >= 1000:
|
||||
# Split integer and decimal parts using string formatting to avoid precision loss
|
||||
formatted_str = f"{num:.2f}"
|
||||
int_str, dec_str = formatted_str.split(".")
|
||||
int_part = int(int_str)
|
||||
formatted_int = f"{int_part:,}".replace(',', '.')
|
||||
variants.append(f"{formatted_int},{dec_str}") # 3.045,52
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
return FieldNormalizer._amount.normalize(value)
|
||||
|
||||
@staticmethod
|
||||
def normalize_date(value: str) -> list[str]:
|
||||
"""
|
||||
Normalize date to YYYY-MM-DD and generate variants.
|
||||
|
||||
Handles:
|
||||
'2025-12-13' -> ['2025-12-13', '13/12/2025', '13.12.2025']
|
||||
'13/12/2025' -> ['2025-12-13', '13/12/2025', ...]
|
||||
'13 december 2025' -> ['2025-12-13', ...]
|
||||
|
||||
Note: For ambiguous formats like DD/MM/YYYY vs MM/DD/YYYY,
|
||||
we generate variants for BOTH interpretations to maximize matching.
|
||||
Delegates to DateNormalizer.
|
||||
"""
|
||||
value = FieldNormalizer.clean_text(value)
|
||||
variants = [value]
|
||||
|
||||
parsed_dates = [] # May have multiple interpretations
|
||||
|
||||
# Try different date formats
|
||||
date_patterns = [
|
||||
# ISO format with optional time (e.g., 2026-01-09 00:00:00)
|
||||
(r'^(\d{4})-(\d{1,2})-(\d{1,2})(?:\s+\d{1,2}:\d{2}:\d{2})?$', lambda m: (int(m[1]), int(m[2]), int(m[3]))),
|
||||
# Swedish format: YYMMDD
|
||||
(r'^(\d{2})(\d{2})(\d{2})$', lambda m: (2000 + int(m[1]) if int(m[1]) < 50 else 1900 + int(m[1]), int(m[2]), int(m[3]))),
|
||||
# Swedish format: YYYYMMDD
|
||||
(r'^(\d{4})(\d{2})(\d{2})$', lambda m: (int(m[1]), int(m[2]), int(m[3]))),
|
||||
]
|
||||
|
||||
# Ambiguous patterns - try both DD/MM and MM/DD interpretations
|
||||
ambiguous_patterns_4digit_year = [
|
||||
# Format with / - could be DD/MM/YYYY (European) or MM/DD/YYYY (US)
|
||||
r'^(\d{1,2})/(\d{1,2})/(\d{4})$',
|
||||
# Format with . - typically European DD.MM.YYYY
|
||||
r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$',
|
||||
# Format with - (not ISO) - could be DD-MM-YYYY or MM-DD-YYYY
|
||||
r'^(\d{1,2})-(\d{1,2})-(\d{4})$',
|
||||
]
|
||||
|
||||
# Patterns with 2-digit year (common in Swedish invoices)
|
||||
ambiguous_patterns_2digit_year = [
|
||||
# Format DD.MM.YY (e.g., 02.08.25 for 2025-08-02)
|
||||
r'^(\d{1,2})\.(\d{1,2})\.(\d{2})$',
|
||||
# Format DD/MM/YY
|
||||
r'^(\d{1,2})/(\d{1,2})/(\d{2})$',
|
||||
# Format DD-MM-YY
|
||||
r'^(\d{1,2})-(\d{1,2})-(\d{2})$',
|
||||
]
|
||||
|
||||
# Try unambiguous patterns first
|
||||
for pattern, extractor in date_patterns:
|
||||
match = re.match(pattern, value)
|
||||
if match:
|
||||
try:
|
||||
year, month, day = extractor(match)
|
||||
parsed_dates.append(datetime(year, month, day))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Try ambiguous patterns with 4-digit year
|
||||
if not parsed_dates:
|
||||
for pattern in ambiguous_patterns_4digit_year:
|
||||
match = re.match(pattern, value)
|
||||
if match:
|
||||
n1, n2, year = int(match[1]), int(match[2]), int(match[3])
|
||||
|
||||
# Try DD/MM/YYYY (European - day first)
|
||||
try:
|
||||
parsed_dates.append(datetime(year, n2, n1))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Try MM/DD/YYYY (US - month first) if different and valid
|
||||
if n1 != n2:
|
||||
try:
|
||||
parsed_dates.append(datetime(year, n1, n2))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if parsed_dates:
|
||||
break
|
||||
|
||||
# Try ambiguous patterns with 2-digit year (e.g., 02.08.25)
|
||||
if not parsed_dates:
|
||||
for pattern in ambiguous_patterns_2digit_year:
|
||||
match = re.match(pattern, value)
|
||||
if match:
|
||||
n1, n2, yy = int(match[1]), int(match[2]), int(match[3])
|
||||
# Convert 2-digit year to 4-digit (00-49 -> 2000s, 50-99 -> 1900s)
|
||||
year = 2000 + yy if yy < 50 else 1900 + yy
|
||||
|
||||
# Try DD/MM/YY (European - day first, most common in Sweden)
|
||||
try:
|
||||
parsed_dates.append(datetime(year, n2, n1))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Try MM/DD/YY (US - month first) if different and valid
|
||||
if n1 != n2:
|
||||
try:
|
||||
parsed_dates.append(datetime(year, n1, n2))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if parsed_dates:
|
||||
break
|
||||
|
||||
# Try Swedish month names
|
||||
if not parsed_dates:
|
||||
for month_name, month_num in FieldNormalizer.SWEDISH_MONTHS.items():
|
||||
if month_name in value.lower():
|
||||
# Extract day and year
|
||||
numbers = re.findall(r'\d+', value)
|
||||
if len(numbers) >= 2:
|
||||
day = int(numbers[0])
|
||||
year = int(numbers[-1])
|
||||
if year < 100:
|
||||
year = 2000 + year if year < 50 else 1900 + year
|
||||
try:
|
||||
parsed_dates.append(datetime(year, int(month_num), day))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Generate variants for all parsed date interpretations
|
||||
swedish_months_full = [
|
||||
'januari', 'februari', 'mars', 'april', 'maj', 'juni',
|
||||
'juli', 'augusti', 'september', 'oktober', 'november', 'december'
|
||||
]
|
||||
swedish_months_abbrev = [
|
||||
'jan', 'feb', 'mar', 'apr', 'maj', 'jun',
|
||||
'jul', 'aug', 'sep', 'okt', 'nov', 'dec'
|
||||
]
|
||||
|
||||
for parsed_date in parsed_dates:
|
||||
# Generate different formats
|
||||
iso = parsed_date.strftime('%Y-%m-%d')
|
||||
eu_slash = parsed_date.strftime('%d/%m/%Y')
|
||||
us_slash = parsed_date.strftime('%m/%d/%Y') # US format MM/DD/YYYY
|
||||
eu_dot = parsed_date.strftime('%d.%m.%Y')
|
||||
iso_dot = parsed_date.strftime('%Y.%m.%d') # ISO with dots (e.g., 2024.02.08)
|
||||
compact = parsed_date.strftime('%Y%m%d') # YYYYMMDD
|
||||
compact_short = parsed_date.strftime('%y%m%d') # YYMMDD (e.g., 260108)
|
||||
|
||||
# Short year with dot separator (e.g., 02.01.26)
|
||||
eu_dot_short = parsed_date.strftime('%d.%m.%y')
|
||||
|
||||
# Short year with slash separator (e.g., 20/10/24) - DD/MM/YY format
|
||||
eu_slash_short = parsed_date.strftime('%d/%m/%y')
|
||||
|
||||
# Short year with hyphen separator (e.g., 23-11-01) - common in Swedish invoices
|
||||
yy_mm_dd_short = parsed_date.strftime('%y-%m-%d')
|
||||
|
||||
# Middle dot separator (OCR sometimes reads hyphens as middle dots)
|
||||
iso_middot = parsed_date.strftime('%Y·%m·%d')
|
||||
|
||||
# Spaced formats (e.g., "2026 01 12", "26 01 12")
|
||||
spaced_full = parsed_date.strftime('%Y %m %d')
|
||||
spaced_short = parsed_date.strftime('%y %m %d')
|
||||
|
||||
# Swedish month name formats (e.g., "9 januari 2026", "9 jan 2026")
|
||||
month_full = swedish_months_full[parsed_date.month - 1]
|
||||
month_abbrev = swedish_months_abbrev[parsed_date.month - 1]
|
||||
swedish_format_full = f"{parsed_date.day} {month_full} {parsed_date.year}"
|
||||
swedish_format_abbrev = f"{parsed_date.day} {month_abbrev} {parsed_date.year}"
|
||||
|
||||
# Swedish month abbreviation with hyphen (e.g., "30-OKT-24", "30-okt-24")
|
||||
month_abbrev_upper = month_abbrev.upper()
|
||||
swedish_hyphen_short = f"{parsed_date.day:02d}-{month_abbrev_upper}-{parsed_date.strftime('%y')}"
|
||||
swedish_hyphen_short_lower = f"{parsed_date.day:02d}-{month_abbrev}-{parsed_date.strftime('%y')}"
|
||||
# Also without leading zero on day
|
||||
swedish_hyphen_short_no_zero = f"{parsed_date.day}-{month_abbrev_upper}-{parsed_date.strftime('%y')}"
|
||||
|
||||
# Swedish month abbreviation with short year in different format (e.g., "SEP-24", "30 SEP 24")
|
||||
month_year_only = f"{month_abbrev_upper}-{parsed_date.strftime('%y')}"
|
||||
swedish_spaced = f"{parsed_date.day:02d} {month_abbrev_upper} {parsed_date.strftime('%y')}"
|
||||
|
||||
variants.extend([
|
||||
iso, eu_slash, us_slash, eu_dot, iso_dot, compact, compact_short,
|
||||
eu_dot_short, eu_slash_short, yy_mm_dd_short, iso_middot, spaced_full, spaced_short,
|
||||
swedish_format_full, swedish_format_abbrev,
|
||||
swedish_hyphen_short, swedish_hyphen_short_lower, swedish_hyphen_short_no_zero,
|
||||
month_year_only, swedish_spaced
|
||||
])
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
return FieldNormalizer._date.normalize(value)
|
||||
|
||||
|
||||
# Field type to normalizer mapping
|
||||
|
||||
Reference in New Issue
Block a user