Initial commit: Invoice field extraction system using YOLO + OCR
Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
290
src/normalize/normalizer.py
Normal file
290
src/normalize/normalizer.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""
|
||||
Field Normalization Module
|
||||
|
||||
Normalizes field values to generate multiple candidate forms for matching.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Callable
|
||||
|
||||
|
||||
@dataclass
class NormalizedValue:
    """Bundle a field value together with the variants derived from it."""

    # The value this record was built from.
    original: str
    # Candidate string forms associated with ``original``.
    variants: list[str]
    # Label of the field the value belongs to (presumably a key such as
    # 'Amount' or 'InvoiceDate' -- confirm against callers).
    field_type: str
|
||||
|
||||
|
||||
class FieldNormalizer:
    """Generate candidate string forms for the different invoice field types.

    Every ``normalize_*`` method returns a list of unique, non-empty strings.
    The cleaned input always comes first and derived variants follow in the
    order they are produced, so results are reproducible between runs.
    """

    # Common Swedish month names for date parsing
    SWEDISH_MONTHS = {
        'januari': '01', 'jan': '01',
        'februari': '02', 'feb': '02',
        'mars': '03', 'mar': '03',
        'april': '04', 'apr': '04',
        'maj': '05',
        'juni': '06', 'jun': '06',
        'juli': '07', 'jul': '07',
        'augusti': '08', 'aug': '08',
        'september': '09', 'sep': '09', 'sept': '09',
        'oktober': '10', 'okt': '10',
        'november': '11', 'nov': '11',
        'december': '12', 'dec': '12'
    }

    # Zero-width characters that are stripped before any other processing.
    _ZERO_WIDTH = re.compile(r'[\u200b\u200c\u200d\ufeff]')

    # Trailing currency markers: "kr", "SEK", ":-", ",-", ".-", "-" and ":",
    # in any combination and separated by optional whitespace.  Written as
    # an alternation so that only whole tokens are removed, never single
    # letters taken from an unrelated word.
    _CURRENCY_SUFFIX = re.compile(
        r'(?:\s*(?:SEK|kr|[:,.]?-|:))+\s*$', re.IGNORECASE
    )

    # (pattern, extractor) pairs tried in order; each extractor maps a match
    # to a (year, month, day) tuple of ints.
    _DATE_PATTERNS = (
        # ISO format with optional time (e.g., 2026-01-09 00:00:00)
        (re.compile(r'^(\d{4})-(\d{1,2})-(\d{1,2})(?:\s+\d{1,2}:\d{2}:\d{2})?$'),
         lambda m: (int(m[1]), int(m[2]), int(m[3]))),
        # European format with /
        (re.compile(r'^(\d{1,2})/(\d{1,2})/(\d{4})$'),
         lambda m: (int(m[3]), int(m[2]), int(m[1]))),
        # European format with .
        (re.compile(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$'),
         lambda m: (int(m[3]), int(m[2]), int(m[1]))),
        # European format with -
        (re.compile(r'^(\d{1,2})-(\d{1,2})-(\d{4})$'),
         lambda m: (int(m[3]), int(m[2]), int(m[1]))),
        # Swedish format: YYMMDD
        (re.compile(r'^(\d{2})(\d{2})(\d{2})$'),
         lambda m: (FieldNormalizer._pivot_year(int(m[1])), int(m[2]), int(m[3]))),
        # Swedish format: YYYYMMDD
        (re.compile(r'^(\d{4})(\d{2})(\d{2})$'),
         lambda m: (int(m[1]), int(m[2]), int(m[3]))),
    )

    @staticmethod
    def _unique(candidates: list[str]) -> list[str]:
        """Drop empty strings and duplicates, keeping first-seen order."""
        return list(dict.fromkeys(c for c in candidates if c))

    @staticmethod
    def _pivot_year(two_digit: int) -> int:
        """Expand a two-digit year: 00-49 -> 2000-2049, 50-99 -> 1950-1999."""
        return 2000 + two_digit if two_digit < 50 else 1900 + two_digit

    @staticmethod
    def clean_text(text: str) -> str:
        """Remove zero-width characters and collapse runs of whitespace.

        Leading and trailing whitespace is removed and every internal run of
        whitespace becomes a single space.
        """
        text = FieldNormalizer._ZERO_WIDTH.sub('', text)
        return ' '.join(text.split())

    @staticmethod
    def normalize_invoice_number(value: str) -> list[str]:
        """
        Normalize invoice number.

        Returns the cleaned value and, when it differs, a digits-only form.

        Examples:
            '100017500321' -> ['100017500321']
            'INV-100017500321' -> ['INV-100017500321', '100017500321']
        """
        value = FieldNormalizer.clean_text(value)
        digits_only = re.sub(r'\D', '', value)

        variants = [value]
        if digits_only and digits_only != value:
            variants.append(digits_only)

        return FieldNormalizer._unique(variants)

    @staticmethod
    def normalize_ocr_number(value: str) -> list[str]:
        """
        Normalize OCR number (Swedish payment reference).

        Handled exactly like an invoice number: cleaned value plus a
        digits-only form.
        """
        return FieldNormalizer.normalize_invoice_number(value)

    @staticmethod
    def normalize_bankgiro(value: str) -> list[str]:
        """
        Normalize Bankgiro number.

        Returns the cleaned value, its digits-only form and, for 7 or 8
        digits, the dashed form (XXX-XXXX or XXXX-XXXX).

        Examples:
            '5393-9484' -> ['5393-9484', '53939484']
            '53939484' -> ['53939484', '5393-9484']
        """
        value = FieldNormalizer.clean_text(value)
        digits_only = re.sub(r'\D', '', value)

        variants = [value]

        if digits_only:
            # Add without dash
            variants.append(digits_only)

            # Add with dash; the last four digits always follow the dash.
            if len(digits_only) in (7, 8):
                variants.append(f"{digits_only[:-4]}-{digits_only[-4:]}")

        return FieldNormalizer._unique(variants)

    @staticmethod
    def normalize_plusgiro(value: str) -> list[str]:
        """
        Normalize Plusgiro number.

        Returns the cleaned value, its digits-only form and, for 7 or 8
        digits, the form with a dash before the final digit.

        Examples:
            '1234567-8' -> ['1234567-8', '12345678']
            '12345678' -> ['12345678', '1234567-8']
        """
        value = FieldNormalizer.clean_text(value)
        digits_only = re.sub(r'\D', '', value)

        variants = [value]

        if digits_only:
            variants.append(digits_only)

            # Dash goes before the last digit (7+1 and 6+1 layouts).
            if len(digits_only) in (7, 8):
                variants.append(f"{digits_only[:-1]}-{digits_only[-1]}")

        return FieldNormalizer._unique(variants)

    @staticmethod
    def normalize_amount(value: str) -> list[str]:
        """
        Normalize monetary amount.

        Trailing currency markers are removed, spaces used as thousand
        separators are dropped, and both comma and dot decimal spellings are
        produced.  When both '.' and ',' occur, the one appearing last is
        taken as the decimal mark and the other is discarded as grouping.
        Returns an empty list when nothing is left after the currency
        marker has been removed.

        Examples:
            '114' -> ['114', '114,00', '114.00']
            '114,00' -> ['114,00', '114.00', '114']
            '1 234,56' -> ['1 234,56', '1234,56', '1234.56']
            '1.234,56' -> ['1.234,56', '1234,56', '1234.56']
            '114 kr' -> ['114', '114,00', '114.00']
        """
        value = FieldNormalizer.clean_text(value)

        # Remove currency symbols and common suffixes
        value = FieldNormalizer._CURRENCY_SUFFIX.sub('', value).strip()

        # Remove spaces (thousand separators)
        compact = value.replace(' ', '').replace('\xa0', '')
        if not compact:
            # Only a currency marker (or nothing) was supplied; emitting
            # ',00' / '.00' would match unrelated amounts.
            return []

        # With both separators present, the right-most one is the decimal
        # mark; the other is a grouping character and carries no value.
        last_comma = compact.rfind(',')
        last_dot = compact.rfind('.')
        if last_comma != -1 and last_dot != -1:
            grouping = '.' if last_comma > last_dot else ','
            compact = compact.replace(grouping, '')

        variants = [value, compact]

        # Normalize decimal separator
        if ',' in compact:
            variants.append(compact.replace(',', '.'))
        elif '.' in compact:
            variants.append(compact.replace('.', ','))
        else:
            # Integer amount - add decimal versions
            variants.extend([f"{compact},00", f"{compact}.00"])

        # Try to parse and get clean numeric value
        try:
            number = float(compact.replace(',', '.'))
            # int() raises ValueError for NaN and OverflowError for infinity.
            whole = int(number)
        except (ValueError, OverflowError):
            return FieldNormalizer._unique(variants)

        if number == whole:
            variants.extend([str(whole), f"{whole},00", f"{whole}.00"])
        else:
            two_decimals = f"{number:.2f}"
            variants.extend([two_decimals, two_decimals.replace('.', ',')])

        return FieldNormalizer._unique(variants)

    @staticmethod
    def normalize_date(value: str) -> list[str]:
        """
        Normalize date and generate variants.

        When the value can be parsed, the result is the cleaned input
        followed by YYYY-MM-DD, DD/MM/YYYY, DD.MM.YYYY and YYYYMMDD.  When it
        cannot be parsed, only the cleaned input is returned.

        Examples:
            '2025-12-13' -> ['2025-12-13', '13/12/2025', '13.12.2025', '20251213']
            '13/12/2025' -> ['13/12/2025', '2025-12-13', '13.12.2025', '20251213']
            '13 december 2025' -> ['13 december 2025', '2025-12-13', ...]
        """
        value = FieldNormalizer.clean_text(value)
        variants = [value]

        parsed_date = None

        # Try the purely numeric layouts first.
        for pattern, extractor in FieldNormalizer._DATE_PATTERNS:
            match = pattern.match(value)
            if not match:
                continue
            try:
                year, month, day = extractor(match)
                parsed_date = datetime(year, month, day)
                break
            except ValueError:
                # Matched the layout but is not a real calendar date.
                continue

        # Try Swedish month names
        if parsed_date is None:
            lowered = value.lower()
            month_numbers = [
                number
                for name, number in FieldNormalizer.SWEDISH_MONTHS.items()
                if name in lowered
            ]
            numbers = re.findall(r'\d+', value) if month_numbers else []
            if len(numbers) >= 2:
                # First number is taken as the day, last one as the year.
                day = int(numbers[0])
                year = int(numbers[-1])
                if year < 100:
                    year = FieldNormalizer._pivot_year(year)
                for month_num in month_numbers:
                    try:
                        parsed_date = datetime(year, int(month_num), day)
                        break
                    except (ValueError, OverflowError):
                        # Out-of-range day/year; datetime raises
                        # OverflowError for values beyond a C int.
                        continue

        if parsed_date is not None:
            # Generate different formats
            variants.extend([
                parsed_date.strftime('%Y-%m-%d'),
                parsed_date.strftime('%d/%m/%Y'),
                parsed_date.strftime('%d.%m.%Y'),
                parsed_date.strftime('%Y%m%d'),
            ])

        return FieldNormalizer._unique(variants)
|
||||
|
||||
|
||||
# Dispatch table: field name -> function that expands a raw value into its
# list of match candidates.  Both date fields share one normalizer.
NORMALIZERS: dict[str, Callable[[str], list[str]]] = dict(
    InvoiceNumber=FieldNormalizer.normalize_invoice_number,
    OCR=FieldNormalizer.normalize_ocr_number,
    Bankgiro=FieldNormalizer.normalize_bankgiro,
    Plusgiro=FieldNormalizer.normalize_plusgiro,
    Amount=FieldNormalizer.normalize_amount,
    InvoiceDate=FieldNormalizer.normalize_date,
    InvoiceDueDate=FieldNormalizer.normalize_date,
)
|
||||
|
||||
|
||||
def normalize_field(field_name: str, value: str) -> list[str]:
    """
    Normalize a field value based on its type.

    Args:
        field_name: Name of the field (e.g., 'InvoiceNumber', 'Amount').
            Names without an entry in ``NORMALIZERS`` fall back to plain
            text cleaning.
        value: Raw value to normalize. Non-string values are converted
            with ``str()``; ``None`` is treated as missing.

    Returns:
        List of normalized variants. Empty when the value is ``None`` or
        contains nothing but whitespace / zero-width characters.
    """
    if value is None:
        return []

    raw = str(value)

    # Clean before testing for emptiness: str.strip() alone does not remove
    # zero-width characters, which would otherwise yield [''] below.
    cleaned = FieldNormalizer.clean_text(raw)
    if not cleaned:
        return []

    normalizer = NORMALIZERS.get(field_name)
    if normalizer is None:
        # Default: just clean the text
        return [cleaned]

    return normalizer(raw)
|
||||
Reference in New Issue
Block a user