Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions

290
src/normalize/normalizer.py Normal file
View File

@@ -0,0 +1,290 @@
"""
Field Normalization Module
Normalizes field values to generate multiple candidate forms for matching.
"""
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Callable
@dataclass
class NormalizedValue:
    """Represents a normalized value with its variants.

    A plain data holder: it performs no normalization or validation itself.
    """

    # The value the variants were derived from.
    original: str
    # Alternative string forms of ``original``.
    variants: list[str]
    # Field type label; presumably one of the NORMALIZERS keys
    # (e.g. 'Amount') -- not enforced here, confirm against callers.
    field_type: str
class FieldNormalizer:
    """Generate alternative string forms of raw invoice field values.

    Every public ``normalize_*`` method cleans its input with
    :meth:`clean_text` and returns a list of candidate strings. Empty strings
    and duplicates are removed, and the list order is deterministic: the
    cleaned input comes first, followed by derived forms in the order they
    were generated.
    """

    # Common Swedish month names for date parsing
    SWEDISH_MONTHS = {
        'januari': '01', 'jan': '01',
        'februari': '02', 'feb': '02',
        'mars': '03', 'mar': '03',
        'april': '04', 'apr': '04',
        'maj': '05',
        'juni': '06', 'jun': '06',
        'juli': '07', 'jul': '07',
        'augusti': '08', 'aug': '08',
        'september': '09', 'sep': '09', 'sept': '09',
        'oktober': '10', 'okt': '10',
        'november': '11', 'nov': '11',
        'december': '12', 'dec': '12'
    }

    # Zero-width characters removed by clean_text().
    _ZERO_WIDTH_RE = re.compile(r'[\u200b\u200c\u200d\ufeff]')

    # Trailing currency markers: the tokens "SEK" and "kr" plus ':' / '-'
    # punctuation (as in "114:-"), optionally separated by whitespace.
    # This is an alternation on purpose: a character class such as
    # [SEK|kr|:-] would also eat unrelated trailing letters ("EUR" -> "EU").
    _AMOUNT_SUFFIX_RE = re.compile(r'(?:SEK|kr|[:\-\s])+$', re.IGNORECASE)

    # (compiled pattern, (year group, month group, day group)), tried in order.
    _DATE_PATTERNS = (
        # ISO format with optional time (e.g., 2026-01-09 00:00:00)
        (re.compile(r'^(\d{4})-(\d{1,2})-(\d{1,2})(?:\s+\d{1,2}:\d{2}:\d{2})?$'), (1, 2, 3)),
        # European format with /
        (re.compile(r'^(\d{1,2})/(\d{1,2})/(\d{4})$'), (3, 2, 1)),
        # European format with .
        (re.compile(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$'), (3, 2, 1)),
        # European format with -
        (re.compile(r'^(\d{1,2})-(\d{1,2})-(\d{4})$'), (3, 2, 1)),
        # Swedish format: YYMMDD (the only pattern with a two-digit year)
        (re.compile(r'^(\d{2})(\d{2})(\d{2})$'), (1, 2, 3)),
        # Swedish format: YYYYMMDD
        (re.compile(r'^(\d{4})(\d{2})(\d{2})$'), (1, 2, 3)),
    )

    # strftime formats emitted for every successfully parsed date.
    _DATE_OUTPUT_FORMATS = ('%Y-%m-%d', '%d/%m/%Y', '%d.%m.%Y', '%Y%m%d')

    @staticmethod
    def _unique(candidates) -> list[str]:
        """Drop empty strings and duplicates, keeping first-seen order."""
        # dict preserves insertion order, unlike set, so results are stable
        # across runs regardless of string hash randomization.
        return list(dict.fromkeys(c for c in candidates if c))

    @staticmethod
    def _expand_two_digit_year(year: int) -> int:
        """Map a two-digit year to a century: 00-49 -> 20xx, 50-99 -> 19xx."""
        return 2000 + year if year < 50 else 1900 + year

    @staticmethod
    def clean_text(text: str) -> str:
        """Remove zero-width characters and collapse whitespace runs.

        Leading/trailing whitespace is dropped and every internal run of
        whitespace becomes a single space.
        """
        text = FieldNormalizer._ZERO_WIDTH_RE.sub('', text)
        # split() with no argument discards leading/trailing whitespace too.
        return ' '.join(text.split())

    @staticmethod
    def normalize_invoice_number(value: str) -> list[str]:
        """
        Normalize invoice number.

        Returns the cleaned value plus, when it differs, a digits-only form.

        Examples:
            '100017500321' -> ['100017500321']
            'INV-100017500321' -> ['INV-100017500321', '100017500321']
        """
        value = FieldNormalizer.clean_text(value)
        digits_only = re.sub(r'\D', '', value)
        return FieldNormalizer._unique([value, digits_only])

    @staticmethod
    def normalize_ocr_number(value: str) -> list[str]:
        """
        Normalize OCR number (Swedish payment reference).

        Handled exactly like an invoice number: cleaned value plus a
        digits-only form.
        """
        return FieldNormalizer.normalize_invoice_number(value)

    @staticmethod
    def normalize_bankgiro(value: str) -> list[str]:
        """
        Normalize Bankgiro number.

        Produces the cleaned value, the digits-only form and, for 7- or
        8-digit numbers, the dashed form (XXX-XXXX / XXXX-XXXX).

        Examples:
            '5393-9484' -> ['5393-9484', '53939484']
            '53939484' -> ['53939484', '5393-9484']
        """
        value = FieldNormalizer.clean_text(value)
        digits_only = re.sub(r'\D', '', value)
        variants = [value, digits_only]
        if len(digits_only) in (7, 8):
            # The dash always precedes the final four digits.
            variants.append(f"{digits_only[:-4]}-{digits_only[-4:]}")
        return FieldNormalizer._unique(variants)

    @staticmethod
    def normalize_plusgiro(value: str) -> list[str]:
        """
        Normalize Plusgiro number.

        Produces the cleaned value, the digits-only form and, for 7- or
        8-digit numbers, the dashed form with the last digit split off.

        Examples:
            '1234567-8' -> ['1234567-8', '12345678']
            '12345678' -> ['12345678', '1234567-8']
        """
        value = FieldNormalizer.clean_text(value)
        digits_only = re.sub(r'\D', '', value)
        variants = [value, digits_only]
        if len(digits_only) in (7, 8):
            # Plusgiro format: leading digits, dash, one trailing check digit.
            variants.append(f"{digits_only[:-1]}-{digits_only[-1]}")
        return FieldNormalizer._unique(variants)

    @staticmethod
    def normalize_amount(value: str) -> list[str]:
        """
        Normalize monetary amount.

        Strips a trailing currency marker ("SEK", "kr", ":-"), removes space
        thousand separators and emits both ',' and '.' decimal forms. When
        both ',' and '.' occur, the right-most one is taken as the decimal
        mark and the other as a thousands separator. Returns an empty list
        when nothing is left after stripping the currency marker.

        Examples:
            '114' -> ['114', '114,00', '114.00']
            '114,00' -> ['114,00', '114.00', '114']
            '1 234,56' -> ['1 234,56', '1234,56', '1234.56']
            '1.234,56' -> ['1.234,56', '1234.56', '1234,56']
        """
        value = FieldNormalizer.clean_text(value)
        value = FieldNormalizer._AMOUNT_SUFFIX_RE.sub('', value).strip()
        # clean_text() already reduced every whitespace run (including
        # non-breaking spaces) to a single ' ', so this removes all of them.
        no_space = value.replace(' ', '')
        if not no_space:
            # Nothing numeric to work with; avoid emitting ',00' / '.00'.
            return []

        variants = [value, no_space]
        has_comma = ',' in no_space
        has_dot = '.' in no_space
        if has_comma and has_dot:
            if no_space.rfind(',') > no_space.rfind('.'):
                # e.g. 1.234,56
                canonical = no_space.replace('.', '').replace(',', '.')
            else:
                # e.g. 1,234.56
                canonical = no_space.replace(',', '')
            variants += [canonical, canonical.replace('.', ',')]
        elif has_comma:
            canonical = no_space.replace(',', '.')
            variants.append(canonical)
        elif has_dot:
            canonical = no_space
            variants.append(no_space.replace('.', ','))
        else:
            canonical = no_space
            # Only a genuine integer gets explicit zero decimals appended.
            if re.fullmatch(r'[+-]?\d+', no_space):
                variants += [f"{no_space},00", f"{no_space}.00"]

        try:
            num = float(canonical)
            # int() raises ValueError for NaN and OverflowError for infinity.
            whole = int(num)
        except (ValueError, OverflowError):
            return FieldNormalizer._unique(variants)

        if num == whole:
            variants += [str(whole), f"{whole},00", f"{whole}.00"]
        else:
            two_decimals = f"{num:.2f}"
            variants += [two_decimals, two_decimals.replace('.', ',')]
        return FieldNormalizer._unique(variants)

    @staticmethod
    def _parse_numeric_date(value: str):
        """Return a datetime for the first all-numeric pattern that fits, else None."""
        for pattern, (year_group, month_group, day_group) in FieldNormalizer._DATE_PATTERNS:
            match = pattern.match(value)
            if match is None:
                continue
            year_text = match[year_group]
            year = int(year_text)
            if len(year_text) == 2:
                year = FieldNormalizer._expand_two_digit_year(year)
            try:
                return datetime(year, int(match[month_group]), int(match[day_group]))
            except ValueError:
                # Shape matched but the calendar date is invalid; try the next.
                continue
        return None

    @staticmethod
    def _parse_swedish_month_date(value: str):
        """Return a datetime for text such as '13 december 2025', else None.

        The first number in the text is taken as the day and the last as the
        year; the month comes from the first SWEDISH_MONTHS name found in the
        text that yields a valid date.
        """
        numbers = re.findall(r'\d+', value)
        if len(numbers) < 2:
            return None
        day = int(numbers[0])
        year = int(numbers[-1])
        if year < 100:
            year = FieldNormalizer._expand_two_digit_year(year)

        lowered = value.lower()
        for month_name, month_num in FieldNormalizer.SWEDISH_MONTHS.items():
            if month_name not in lowered:
                continue
            try:
                return datetime(year, int(month_num), day)
            except (ValueError, OverflowError):
                # Out-of-range day/year (OverflowError for huge integers).
                continue
        return None

    @staticmethod
    def normalize_date(value: str) -> list[str]:
        """
        Normalize date and generate variants.

        Accepts ISO dates (optionally followed by a time), European
        day-first dates separated by '/', '.' or '-', compact YYMMDD /
        YYYYMMDD, and text containing a Swedish month name. When the value
        parses, the ISO, DD/MM/YYYY, DD.MM.YYYY and YYYYMMDD forms are added
        after the cleaned input; otherwise only the cleaned input is returned.

        Examples:
            '2025-12-13' -> ['2025-12-13', '13/12/2025', '13.12.2025', '20251213']
            '13/12/2025' -> ['13/12/2025', '2025-12-13', '13.12.2025', '20251213']
            '13 december 2025' -> ['13 december 2025', '2025-12-13', ...]
        """
        value = FieldNormalizer.clean_text(value)
        parsed_date = FieldNormalizer._parse_numeric_date(value)
        if parsed_date is None:
            parsed_date = FieldNormalizer._parse_swedish_month_date(value)

        variants = [value]
        if parsed_date is not None:
            variants.extend(
                parsed_date.strftime(fmt)
                for fmt in FieldNormalizer._DATE_OUTPUT_FORMATS
            )
        return FieldNormalizer._unique(variants)
# Dispatch table: field name -> function that produces that field's variants.
# Both date fields share normalize_date.
NORMALIZERS: dict[str, Callable[[str], list[str]]] = dict(
    InvoiceNumber=FieldNormalizer.normalize_invoice_number,
    OCR=FieldNormalizer.normalize_ocr_number,
    Bankgiro=FieldNormalizer.normalize_bankgiro,
    Plusgiro=FieldNormalizer.normalize_plusgiro,
    Amount=FieldNormalizer.normalize_amount,
    InvoiceDate=FieldNormalizer.normalize_date,
    InvoiceDueDate=FieldNormalizer.normalize_date,
)
def normalize_field(field_name: str, value: str) -> list[str]:
    """
    Produce the match variants for one field value.

    Looks up ``field_name`` in NORMALIZERS and applies the matching
    normalizer; field names without an entry only get their text cleaned.

    Args:
        field_name: Name of the field (e.g., 'InvoiceNumber', 'Amount')
        value: Raw value to normalize; non-string values are converted
            with ``str()`` first

    Returns:
        List of normalized variants, empty when ``value`` is None or a
        blank string
    """
    # Guard clauses: nothing to normalize for missing or blank input.
    if value is None:
        return []
    if isinstance(value, str) and not value.strip():
        return []

    text = str(value)
    handler = NORMALIZERS.get(field_name)
    # Unknown field types fall back to plain text cleaning.
    return handler(text) if handler else [FieldNormalizer.clean_text(text)]