Files
invoice-master-poc-v2/packages/backend/backend/pipeline/normalizers/invoice_number.py
Yaojia Wang 58d36c8927 WIP
2026-02-12 23:06:00 +01:00

96 lines
3.4 KiB
Python

"""
Invoice Number Normalizer
Handles normalization and validation of invoice numbers.
"""
import re
from .base import BaseNormalizer, NormalizationResult
class InvoiceNumberNormalizer(BaseNormalizer):
    """
    Normalizes invoice numbers from Swedish invoices.

    Invoice numbers can be:
    - Pure digits: 12345678
    - Alphanumeric: A3861, INV-2024-001, F12345
    - With separators: 2024/001, 2024-001

    Strategy (the first rule that produces a candidate wins):
    1. Alphanumeric patterns, matched case-insensitively and upper-cased.
    2. Year-prefixed numbers such as 2024-001 or 2024/12345.
    3. Standalone runs of 3-10 digits, skipping values that look like a
       date or a bare year, preferring typical invoice-number lengths.
    4. Fallback: every digit in the text concatenated (at most 15 digits),
       reported with a warning.
    """

    # Alphanumeric patterns, tried in this order; for each one the leftmost
    # match in the text is used.
    _ALPHA_PATTERNS = (
        # A3861, INV12345
        re.compile(r"\b([A-Z]{1,3}\d{3,10})\b", re.IGNORECASE),
        # 12345A
        re.compile(r"\b(\d{3,10}[A-Z]{1,3})\b", re.IGNORECASE),
        # INV-12345, FAK12345, INV-2024-001. The trailing group keeps further
        # "-digits" / "/digits" segments so that multi-part numbers are not
        # cut off after the first digit group.
        re.compile(
            r"\b([A-Z]{2,5}[-/]?\d{3,10}(?:[-/]\d{1,10})*)\b", re.IGNORECASE
        ),
    )
    # Year prefix + separator + running number: 2024-001, 2024/12345
    _YEAR_PATTERN = re.compile(r"\b(20\d{2}[-/]\d{3,8})\b")
    # Standalone digit runs; the word boundaries mean runs longer than
    # 10 digits do not match at all, so no extra length filter is needed.
    _DIGIT_RUN_PATTERN = re.compile(r"\b(\d{3,10})\b")
    _NON_DIGIT_PATTERN = re.compile(r"\D")

    @property
    def field_name(self) -> str:
        """Name of the field handled by this normalizer."""
        return "InvoiceNumber"

    @staticmethod
    def _looks_like_date_or_year(seq: str) -> bool:
        """Return True for 8-digit (YYYYMMDD-like) or 4-digit (year-like) runs starting with "20"."""
        return len(seq) in (4, 8) and seq.startswith("20")

    @staticmethod
    def _score(seq: str) -> tuple[int, int]:
        """
        Rank a digit run; a higher tuple is a better candidate.

        Runs of 4-8 digits always outrank other lengths and are ordered by
        closeness to 6 digits. Outside that range shorter runs rank higher.
        This avoids picking short fragments like "775" from amounts when a
        more typical candidate is present.
        """
        length = len(seq)
        if 4 <= length <= 8:
            return (1, -abs(length - 6))
        return (0, -length)

    def normalize(self, text: str) -> NormalizationResult:
        """
        Extract an invoice number from raw text.

        Args:
            text: Raw text of the invoice-number field.

        Returns:
            NormalizationResult.success(...) with the extracted number,
            NormalizationResult.success_with_warning(...) when only the
            digit-concatenation fallback applied, or
            NormalizationResult.failure(...) when the text is empty or
            contains fewer than three digits.
        """
        text = text.strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # Rule 1: alphanumeric invoice numbers (A3861, F12345, INV-2024-001)
        for pattern in self._ALPHA_PATTERNS:
            match = pattern.search(text)
            if match:
                return NormalizationResult.success(match.group(1).upper())

        # Rule 2: invoice number with year prefix (2024-001, 2024/12345)
        match = self._YEAR_PATTERN.search(text)
        if match:
            return NormalizationResult.success(match.group(1))

        # Rule 3: standalone digit runs, minus date-like and year-like values
        candidates = [
            seq
            for seq in self._DIGIT_RUN_PATTERN.findall(text)
            if not self._looks_like_date_or_year(seq)
        ]
        if candidates:
            # max() keeps the first of equally ranked runs, i.e. the
            # leftmost one in the text.
            return NormalizationResult.success(max(candidates, key=self._score))

        # Rule 4: fallback - extract all digits if nothing else works
        digits = self._NON_DIGIT_PATTERN.sub("", text)
        if len(digits) >= 3:
            # Limit to first 15 digits to avoid very long sequences
            return NormalizationResult.success_with_warning(
                digits[:15], "Fallback extraction"
            )

        return NormalizationResult.failure(
            f"Cannot extract invoice number from: {text[:50]}"
        )