Add payment line parser and fix OCR override from payment_line

- Add MachineCodeParser for Swedish invoice payment line parsing
- Fix OCR Reference extraction by normalizing account number spaces
- Add cross-validation tests for pipeline and field_extractor
- Update UI layout for compact upload and full-width results

Key changes:
- machine_code_parser.py: Handle spaces in Bankgiro numbers (e.g. "78 2 1 713")
- pipeline.py: OCR and Amount override from payment_line, BG/PG comparison only
- field_extractor.py: Improved invoice number normalization
- app.py: Responsive UI layout changes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-01-21 21:47:02 +01:00
parent e9460e9f34
commit 4ea4bc96d4
33 changed files with 7530 additions and 562 deletions

View File

@@ -238,18 +238,77 @@ class FieldExtractor:
elif field_name in ('InvoiceDate', 'InvoiceDueDate'):
return self._normalize_date(text)
elif field_name == 'payment_line':
return self._normalize_payment_line(text)
elif field_name == 'supplier_org_number':
return self._normalize_supplier_org_number(text)
elif field_name == 'customer_number':
return self._normalize_customer_number(text)
else:
return text, True, None
def _normalize_invoice_number(self, text: str) -> tuple[str | None, bool, str | None]:
"""Normalize invoice number."""
# Extract digits only
"""
Normalize invoice number.
Invoice numbers can be:
- Pure digits: 12345678
- Alphanumeric: A3861, INV-2024-001, F12345
- With separators: 2024/001, 2024-001
Strategy:
1. Look for common invoice number patterns
2. Prefer shorter, more specific matches over long digit sequences
"""
# Pattern 1: Alphanumeric invoice number (letter + digits or digits + letter)
# Examples: A3861, F12345, INV001
alpha_patterns = [
r'\b([A-Z]{1,3}\d{3,10})\b', # A3861, INV12345
r'\b(\d{3,10}[A-Z]{1,3})\b', # 12345A
r'\b([A-Z]{2,5}[-/]?\d{3,10})\b', # INV-12345, FAK12345
]
for pattern in alpha_patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(1).upper(), True, None
# Pattern 2: Invoice number with year prefix (2024-001, 2024/12345)
year_pattern = r'\b(20\d{2}[-/]\d{3,8})\b'
match = re.search(year_pattern, text)
if match:
return match.group(1), True, None
# Pattern 3: Short digit sequence (3-10 digits) - prefer shorter sequences
# This avoids capturing long OCR numbers
digit_sequences = re.findall(r'\b(\d{3,10})\b', text)
if digit_sequences:
# Prefer shorter sequences (more likely to be invoice number)
# Also filter out sequences that look like dates (8 digits starting with 20)
valid_sequences = []
for seq in digit_sequences:
# Skip if it looks like a date (YYYYMMDD)
if len(seq) == 8 and seq.startswith('20'):
continue
# Skip if too long (likely OCR number)
if len(seq) > 10:
continue
valid_sequences.append(seq)
if valid_sequences:
# Return shortest valid sequence
return min(valid_sequences, key=len), True, None
# Fallback: extract all digits if nothing else works
digits = re.sub(r'\D', '', text)
if len(digits) >= 3:
# Limit to first 15 digits to avoid very long sequences
return digits[:15], True, "Fallback extraction"
if len(digits) < 3:
return None, False, f"Too few digits: {len(digits)}"
return digits, True, None
return None, False, f"Cannot extract invoice number from: {text[:50]}"
def _normalize_ocr_number(self, text: str) -> tuple[str | None, bool, str | None]:
"""Normalize OCR number."""
@@ -260,33 +319,174 @@ class FieldExtractor:
return digits, True, None
def _normalize_bankgiro(self, text: str) -> tuple[str | None, bool, str | None]:
"""Normalize Bankgiro number."""
digits = re.sub(r'\D', '', text)
def _luhn_checksum(self, digits: str) -> bool:
"""
Validate using Luhn (Mod10) algorithm.
Used for Bankgiro, Plusgiro, and OCR number validation.
if len(digits) == 8:
# Format as XXXX-XXXX
formatted = f"{digits[:4]}-{digits[4:]}"
return formatted, True, None
elif len(digits) == 7:
# Format as XXX-XXXX
formatted = f"{digits[:3]}-{digits[3:]}"
return formatted, True, None
elif 6 <= len(digits) <= 9:
return digits, True, None
else:
return None, False, f"Invalid Bankgiro length: {len(digits)}"
The checksum is valid if the total modulo 10 equals 0.
"""
if not digits.isdigit():
return False
total = 0
for i, char in enumerate(reversed(digits)):
digit = int(char)
if i % 2 == 1: # Double every second digit from right
digit *= 2
if digit > 9:
digit -= 9
total += digit
return total % 10 == 0
def _detect_giro_type(self, text: str) -> str | None:
"""
Detect if text matches BG or PG display format pattern.
BG typical format: ^\d{3,4}-\d{4}$ (e.g., 123-4567, 1234-5678)
PG typical format: ^\d{1,7}-\d$ (e.g., 1-8, 12345-6, 1234567-8)
Returns: 'BG', 'PG', or None if cannot determine
"""
text = text.strip()
# BG pattern: 3-4 digits, dash, 4 digits (total 7-8 digits)
if re.match(r'^\d{3,4}-\d{4}$', text):
return 'BG'
# PG pattern: 1-7 digits, dash, 1 digit (total 2-8 digits)
if re.match(r'^\d{1,7}-\d$', text):
return 'PG'
return None
def _normalize_bankgiro(self, text: str) -> tuple[str | None, bool, str | None]:
"""
Normalize Bankgiro number.
Bankgiro rules:
- 7 or 8 digits only
- Last digit is Luhn (Mod10) check digit
- Display format: XXX-XXXX (7 digits) or XXXX-XXXX (8 digits)
Display pattern: ^\d{3,4}-\d{4}$
Normalized pattern: ^\d{7,8}$
Note: Text may contain both BG and PG numbers. We specifically look for
BG display format (XXX-XXXX or XXXX-XXXX) to extract the correct one.
"""
# Look for BG display format pattern: 3-4 digits, dash, 4 digits
# This distinguishes BG from PG which uses X-X format (digits-single digit)
bg_matches = re.findall(r'(\d{3,4})-(\d{4})', text)
if bg_matches:
# Try each match and find one with valid Luhn
for match in bg_matches:
digits = match[0] + match[1]
if len(digits) in (7, 8) and self._luhn_checksum(digits):
# Valid BG found
if len(digits) == 8:
formatted = f"{digits[:4]}-{digits[4:]}"
else:
formatted = f"{digits[:3]}-{digits[3:]}"
return formatted, True, None
# No valid Luhn, use first match
digits = bg_matches[0][0] + bg_matches[0][1]
if len(digits) in (7, 8):
if len(digits) == 8:
formatted = f"{digits[:4]}-{digits[4:]}"
else:
formatted = f"{digits[:3]}-{digits[3:]}"
return formatted, True, f"Luhn checksum failed (possible OCR error)"
# Fallback: try to find 7-8 consecutive digits
# But first check if text contains PG format (XXXXXXX-X), if so don't use fallback
# to avoid misinterpreting PG as BG
pg_format_present = re.search(r'(?<![0-9])\d{1,7}-\d(?!\d)', text)
if pg_format_present:
return None, False, f"No valid Bankgiro found in text"
digit_match = re.search(r'\b(\d{7,8})\b', text)
if digit_match:
digits = digit_match.group(1)
if len(digits) in (7, 8):
luhn_ok = self._luhn_checksum(digits)
if len(digits) == 8:
formatted = f"{digits[:4]}-{digits[4:]}"
else:
formatted = f"{digits[:3]}-{digits[3:]}"
if luhn_ok:
return formatted, True, None
else:
return formatted, True, f"Luhn checksum failed (possible OCR error)"
return None, False, f"No valid Bankgiro found in text"
def _normalize_plusgiro(self, text: str) -> tuple[str | None, bool, str | None]:
"""Normalize Plusgiro number."""
digits = re.sub(r'\D', '', text)
"""
Normalize Plusgiro number.
if len(digits) >= 6:
# Format as XXXXXXX-X
Plusgiro rules:
- 2 to 8 digits
- Last digit is Luhn (Mod10) check digit
- Display format: XXXXXXX-X (all digits except last, dash, last digit)
Display pattern: ^\d{1,7}-\d$
Normalized pattern: ^\d{2,8}$
Note: Text may contain both BG and PG numbers. We specifically look for
PG display format (X-X, XX-X, ..., XXXXXXX-X) to extract the correct one.
"""
# First look for PG display format: 1-7 digits (possibly with spaces), dash, 1 digit
# This is distinct from BG format which has 4 digits after the dash
# Pattern allows spaces within the number like "486 98 63-6"
# (?<![0-9]) ensures we don't start from within another number (like BG)
pg_matches = re.findall(r'(?<![0-9])(\d[\d\s]{0,10})-(\d)(?!\d)', text)
if pg_matches:
# Try each match and find one with valid Luhn
for match in pg_matches:
# Remove spaces from the first part
digits = re.sub(r'\s', '', match[0]) + match[1]
if 2 <= len(digits) <= 8 and self._luhn_checksum(digits):
# Valid PG found
formatted = f"{digits[:-1]}-{digits[-1]}"
return formatted, True, None
# No valid Luhn, use first match with most digits
best_match = max(pg_matches, key=lambda m: len(re.sub(r'\s', '', m[0])))
digits = re.sub(r'\s', '', best_match[0]) + best_match[1]
if 2 <= len(digits) <= 8:
formatted = f"{digits[:-1]}-{digits[-1]}"
return formatted, True, f"Luhn checksum failed (possible OCR error)"
# If no PG format found, extract all digits and format as PG
# This handles cases where the number might be in BG format or raw digits
all_digits = re.sub(r'\D', '', text)
# Try to find a valid 2-8 digit sequence
if 2 <= len(all_digits) <= 8:
luhn_ok = self._luhn_checksum(all_digits)
formatted = f"{all_digits[:-1]}-{all_digits[-1]}"
if luhn_ok:
return formatted, True, None
else:
return formatted, True, f"Luhn checksum failed (possible OCR error)"
# Try to find any 2-8 digit sequence in text
digit_match = re.search(r'\b(\d{2,8})\b', text)
if digit_match:
digits = digit_match.group(1)
luhn_ok = self._luhn_checksum(digits)
formatted = f"{digits[:-1]}-{digits[-1]}"
return formatted, True, None
else:
return None, False, f"Invalid Plusgiro length: {len(digits)}"
if luhn_ok:
return formatted, True, None
else:
return formatted, True, f"Luhn checksum failed (possible OCR error)"
return None, False, f"No valid Plusgiro found in text"
def _normalize_amount(self, text: str) -> tuple[str | None, bool, str | None]:
"""Normalize monetary amount."""
@@ -366,6 +566,169 @@ class FieldExtractor:
return None, False, f"Cannot parse date: {text}"
def _normalize_payment_line(self, text: str) -> tuple[str | None, bool, str | None]:
    """
    Summarize the payment line region as labelled fields.

    The raw text is handed to MachineCodeParser's standard payment line
    parsing. Whichever of OCR, amount and Bankgiro come back non-empty are
    joined, in that order, as "OCR:<v> Amount:<v> BG:<v>". If parsing yields
    nothing usable, the input text is returned unchanged. The result is
    always reported as valid with no error note.
    """
    from ..ocr.machine_code_parser import MachineCodeParser

    # The parser is built around tokens, but at inference time only the raw
    # text is available, so its text-level parsing routine is called directly.
    parsed = MachineCodeParser()._parse_standard_payment_line(text)

    if parsed:
        labelled_keys = (('OCR', 'ocr'), ('Amount', 'amount'), ('BG', 'bankgiro'))
        segments = [
            f"{label}:{parsed[key]}"
            for label, key in labelled_keys
            if parsed.get(key)
        ]
        if segments:
            return ' '.join(segments), True, None

    # Nothing structured could be extracted: pass the raw text through.
    return text, True, None
def _normalize_supplier_org_number(self, text: str) -> tuple[str | None, bool, str | None]:
"""
Normalize Swedish supplier organization number.
Extracts organization number in format: NNNNNN-NNNN (10 digits)
Also handles VAT numbers: SE + 10 digits + 01
Examples:
'org.nr. 516406-1102, Filialregistret...' -> '516406-1102'
'Momsreg.nr SE556123456701' -> '556123-4567'
"""
# Pattern 1: Standard org number format: NNNNNN-NNNN
org_pattern = r'\b(\d{6})-?(\d{4})\b'
match = re.search(org_pattern, text)
if match:
org_num = f"{match.group(1)}-{match.group(2)}"
return org_num, True, None
# Pattern 2: VAT number format: SE + 10 digits + 01
vat_pattern = r'SE\s*(\d{10})01'
match = re.search(vat_pattern, text, re.IGNORECASE)
if match:
digits = match.group(1)
org_num = f"{digits[:6]}-{digits[6:]}"
return org_num, True, None
# Pattern 3: Just 10 consecutive digits
digits_pattern = r'\b(\d{10})\b'
match = re.search(digits_pattern, text)
if match:
digits = match.group(1)
# Validate: first digit should be 1-9 for Swedish org numbers
if digits[0] in '123456789':
org_num = f"{digits[:6]}-{digits[6:]}"
return org_num, True, None
return None, False, f"Cannot extract org number from: {text[:100]}"
def _normalize_customer_number(self, text: str) -> tuple[str | None, bool, str | None]:
    """
    Pull a customer number out of OCR text and return it upper-cased.

    Formats seen in practice:
    - With separators: 'JTY 576-3', 'EMM 256-6', 'FFL 019N'
    - Compact (no separators): 'JTY5763', 'EMM2566', 'FFL019N'
    - Mixed with names: 'VIKSTRÖM, ELIAS CH FFL 01' -> extract 'FFL 01'

    Spaces and dashes may be dropped on the invoice, so both the separated
    and the compact spellings are recognised.

    Stages, tried in order until one yields a value:
    1. Code-shaped patterns anywhere in the text (best-scored match wins)
    2. A value following an explicit customer-number label
    3. A code in the part after the last comma
    4. For short text, the words that contain digits or are short uppercase
    5. The cleaned text itself when it is 3-15 characters long

    Returns:
        ``(value, True, None)`` on success, ``(None, False, message)`` when
        the cleaned text is empty or nothing can be extracted.
    """
    from ..normalize.normalizer import FieldNormalizer

    # Clean the text using the same logic as matcher
    text = FieldNormalizer.clean_text(text)
    if not text:
        return None, False, "Empty text"

    # Code-shaped patterns, ordered by specificity; they cover both the
    # spaced/dashed spellings and the compact ones.
    customer_code_patterns = [
        # Letters + space/dash + digits + dash + digit (EMM 256-6, JTY 576-3)
        r'\b([A-Z]{2,4}[\s\-]?\d{1,4}[\s\-]\d{1,2}[A-Z]?)\b',
        # Letters + space/dash + digits + optional letter (FFL 019N, ABC 123X)
        r'\b([A-Z]{2,4}[\s\-]\d{2,4}[A-Z]?)\b',
        # Compact: letters directly followed by digits + optional letter (JTY5763, FFL019N)
        r'\b([A-Z]{2,4}\d{3,6}[A-Z]?)\b',
        # Single letter + digits (A12345)
        r'\b([A-Z]\d{4,6}[A-Z]?)\b',
        # Digits + dash/space + digits (123-456)
        r'\b(\d{3,6}[\s\-]\d{1,4})\b',
    ]

    # Stage 1: collect every hit of every pattern, then pick the best one.
    candidates = [
        hit
        for code_pattern in customer_code_patterns
        for hit in re.findall(code_pattern, text, re.IGNORECASE)
    ]
    if candidates:
        upper_text = text.upper()

        def score(candidate: str) -> float:
            # Later position (names tend to come first) and greater length
            # both raise the score: last position * 0.1 + length.
            return upper_text.rfind(candidate.upper()) * 0.1 + len(candidate)

        return max(candidates, key=score).strip().upper(), True, None

    # Stage 2: value following an explicit label
    labeled_pattern = r'(?:kund(?:nr|nummer|id)?|ert?\s*(?:kund)?(?:nr|nummer)?|customer\s*(?:no|number|id)?)\s*[:\.]?\s*([A-Za-z0-9][\w\s\-]{1,20}?)(?:\s{2,}|\n|$)'
    labeled = re.search(labeled_pattern, text, re.IGNORECASE)
    if labeled:
        # Drop trailing whitespace and punctuation left over from the capture.
        value = re.sub(r'[\s\.\,\:]+$', '', labeled.group(1).strip())
        if value and len(value) >= 2:
            return value.upper(), True, None

    # Stage 3: text shaped like "NAME, NAME CODE" - search after the last comma
    if ',' in text:
        tail = text.rpartition(',')[2].strip()
        for code_pattern in customer_code_patterns[:3]:  # first 3 patterns only
            found = re.search(code_pattern, tail, re.IGNORECASE)
            if found:
                return found.group(1).strip().upper(), True, None

    # Stage 4: short text - keep only words that look like code parts, i.e.
    # contain a digit or are a short uppercase word (likely an abbreviation).
    if len(text) <= 20:
        code_words = [
            word
            for word in text.split()
            if re.search(r'\d', word) or (len(word) <= 4 and word.isupper())
        ]
        if code_words:
            joined = ' '.join(code_words).upper()
            if len(joined) >= 3:
                return joined, True, None

    # Stage 5: fall back to the cleaned text if its length is reasonable
    if 3 <= len(text) <= 15:
        return text.upper(), True, None

    return None, False, f"Cannot extract customer number from: {text[:50]}"
def extract_all_fields(
self,
detections: list[Detection],