96 lines
3.4 KiB
Python
96 lines
3.4 KiB
Python
"""
Invoice Number Normalizer

Handles normalization and validation of invoice numbers.
"""
|
|
|
|
import re
|
|
|
|
from .base import BaseNormalizer, NormalizationResult
|
|
|
|
|
|
class InvoiceNumberNormalizer(BaseNormalizer):
    """
    Normalizes invoice numbers from Swedish invoices.

    Invoice numbers can be:
    - Pure digits: 12345678
    - Alphanumeric: A3861, INV-2024-001, F12345
    - With separators: 2024/001, 2024-001

    Strategy (the first rule that produces a value wins):
    1. Alphanumeric patterns (letter prefix/suffix around digits).
    2. Year-prefixed numbers such as 2024-001 or 2024/12345.
    3. Standalone runs of 3-10 digits, skipping values that look like
       dates or bare years and preferring typical invoice-number lengths.
    4. Fallback: all digits found in the text, truncated to 15 characters
       and reported with a warning.
    """

    # Alphanumeric patterns, tried in order against the whole text; the
    # first pattern that matches anywhere wins. All are case-insensitive
    # and the match is upper-cased before being returned.
    _ALPHA_PATTERNS = (
        # Prefix + year + sequence (INV-2024-001, INV2024/001). Must be
        # tried first: the shorter patterns below would otherwise stop at
        # the year and return a truncated value such as "INV-2024".
        re.compile(r"\b([A-Z]{1,5}[-/]?20\d{2}[-/]\d{3,8})\b", re.IGNORECASE),
        re.compile(r"\b([A-Z]{1,3}\d{3,10})\b", re.IGNORECASE),  # A3861, INV12345
        re.compile(r"\b(\d{3,10}[A-Z]{1,3})\b", re.IGNORECASE),  # 12345A
        re.compile(r"\b([A-Z]{2,5}[-/]?\d{3,10})\b", re.IGNORECASE),  # INV-12345, FAK12345
    )

    # Invoice number with a year prefix (2024-001, 2024/12345).
    _YEAR_PATTERN = re.compile(r"\b(20\d{2}[-/]\d{3,8})\b")

    # Standalone run of 3-10 digits. The word boundaries mean that longer
    # digit runs do not match at all, so no extra length check is needed.
    _DIGIT_RUN_PATTERN = re.compile(r"\b\d{3,10}\b")

    @property
    def field_name(self) -> str:
        """Name of the field this normalizer handles."""
        return "InvoiceNumber"

    @staticmethod
    def _looks_like_date_or_year(seq: str) -> bool:
        """Return True for 8-digit (YYYYMMDD-like) or 4-digit (year-like) runs starting with "20"."""
        return len(seq) in (4, 8) and seq.startswith("20")

    @staticmethod
    def _length_preference(seq: str) -> tuple[int, int]:
        """Sort key ranking digit runs by how plausible their length is.

        Runs of 4-8 digits rank above everything else and, within that
        range, the length closest to 6 wins. Outside that range shorter
        runs rank higher. This avoids picking short fragments like "775"
        taken from amounts when a better candidate exists.
        """
        length = len(seq)
        if 4 <= length <= 8:
            return (1, -abs(length - 6))
        return (0, -length)

    def normalize(self, text: str) -> NormalizationResult:
        """Extract an invoice number from ``text``.

        Args:
            text: Raw text expected to contain an invoice number.

        Returns:
            ``NormalizationResult.failure(...)`` when the text is empty or
            contains fewer than three digits and no recognizable pattern;
            ``NormalizationResult.success(...)`` with the extracted value
            when one of the patterns matches;
            ``NormalizationResult.success_with_warning(...)`` when only the
            digit-concatenation fallback produced a value.
        """
        # Treat None like empty input instead of raising AttributeError.
        text = (text or "").strip()
        if not text:
            return NormalizationResult.failure("Empty text")

        # 1. Alphanumeric invoice numbers.
        for pattern in self._ALPHA_PATTERNS:
            match = pattern.search(text)
            if match:
                return NormalizationResult.success(match.group(1).upper())

        # 2. Year-prefixed invoice numbers.
        match = self._YEAR_PATTERN.search(text)
        if match:
            return NormalizationResult.success(match.group(1))

        # 3. Plain digit runs, ignoring ones that look like dates or years.
        candidates = [
            seq
            for seq in self._DIGIT_RUN_PATTERN.findall(text)
            if not self._looks_like_date_or_year(seq)
        ]
        if candidates:
            # max() keeps the earliest candidate among equally ranked ones.
            return NormalizationResult.success(
                max(candidates, key=self._length_preference)
            )

        # 4. Fallback: concatenate every digit in the text.
        digits = re.sub(r"\D", "", text)
        if len(digits) >= 3:
            # Limit to the first 15 digits to avoid very long sequences.
            return NormalizationResult.success_with_warning(
                digits[:15], "Fallback extraction"
            )

        return NormalizationResult.failure(
            f"Cannot extract invoice number from: {text[:50]}"
        )