Add payment line parser and fix OCR override from payment_line
- Add MachineCodeParser for Swedish invoice payment line parsing - Fix OCR Reference extraction by normalizing account number spaces - Add cross-validation tests for pipeline and field_extractor - Update UI layout for compact upload and full-width results Key changes: - machine_code_parser.py: Handle spaces in Bankgiro numbers (e.g. "78 2 1 713") - pipeline.py: OCR and Amount override from payment_line, BG/PG comparison only - field_extractor.py: Improved invoice number normalization - app.py: Responsive UI layout changes Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -238,18 +238,77 @@ class FieldExtractor:
|
||||
elif field_name in ('InvoiceDate', 'InvoiceDueDate'):
|
||||
return self._normalize_date(text)
|
||||
|
||||
elif field_name == 'payment_line':
|
||||
return self._normalize_payment_line(text)
|
||||
|
||||
elif field_name == 'supplier_org_number':
|
||||
return self._normalize_supplier_org_number(text)
|
||||
|
||||
elif field_name == 'customer_number':
|
||||
return self._normalize_customer_number(text)
|
||||
|
||||
else:
|
||||
return text, True, None
|
||||
|
||||
def _normalize_invoice_number(self, text: str) -> tuple[str | None, bool, str | None]:
|
||||
"""Normalize invoice number."""
|
||||
# Extract digits only
|
||||
"""
|
||||
Normalize invoice number.
|
||||
|
||||
Invoice numbers can be:
|
||||
- Pure digits: 12345678
|
||||
- Alphanumeric: A3861, INV-2024-001, F12345
|
||||
- With separators: 2024/001, 2024-001
|
||||
|
||||
Strategy:
|
||||
1. Look for common invoice number patterns
|
||||
2. Prefer shorter, more specific matches over long digit sequences
|
||||
"""
|
||||
# Pattern 1: Alphanumeric invoice number (letter + digits or digits + letter)
|
||||
# Examples: A3861, F12345, INV001
|
||||
alpha_patterns = [
|
||||
r'\b([A-Z]{1,3}\d{3,10})\b', # A3861, INV12345
|
||||
r'\b(\d{3,10}[A-Z]{1,3})\b', # 12345A
|
||||
r'\b([A-Z]{2,5}[-/]?\d{3,10})\b', # INV-12345, FAK12345
|
||||
]
|
||||
|
||||
for pattern in alpha_patterns:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
return match.group(1).upper(), True, None
|
||||
|
||||
# Pattern 2: Invoice number with year prefix (2024-001, 2024/12345)
|
||||
year_pattern = r'\b(20\d{2}[-/]\d{3,8})\b'
|
||||
match = re.search(year_pattern, text)
|
||||
if match:
|
||||
return match.group(1), True, None
|
||||
|
||||
# Pattern 3: Short digit sequence (3-10 digits) - prefer shorter sequences
|
||||
# This avoids capturing long OCR numbers
|
||||
digit_sequences = re.findall(r'\b(\d{3,10})\b', text)
|
||||
if digit_sequences:
|
||||
# Prefer shorter sequences (more likely to be invoice number)
|
||||
# Also filter out sequences that look like dates (8 digits starting with 20)
|
||||
valid_sequences = []
|
||||
for seq in digit_sequences:
|
||||
# Skip if it looks like a date (YYYYMMDD)
|
||||
if len(seq) == 8 and seq.startswith('20'):
|
||||
continue
|
||||
# Skip if too long (likely OCR number)
|
||||
if len(seq) > 10:
|
||||
continue
|
||||
valid_sequences.append(seq)
|
||||
|
||||
if valid_sequences:
|
||||
# Return shortest valid sequence
|
||||
return min(valid_sequences, key=len), True, None
|
||||
|
||||
# Fallback: extract all digits if nothing else works
|
||||
digits = re.sub(r'\D', '', text)
|
||||
if len(digits) >= 3:
|
||||
# Limit to first 15 digits to avoid very long sequences
|
||||
return digits[:15], True, "Fallback extraction"
|
||||
|
||||
if len(digits) < 3:
|
||||
return None, False, f"Too few digits: {len(digits)}"
|
||||
|
||||
return digits, True, None
|
||||
return None, False, f"Cannot extract invoice number from: {text[:50]}"
|
||||
|
||||
def _normalize_ocr_number(self, text: str) -> tuple[str | None, bool, str | None]:
|
||||
"""Normalize OCR number."""
|
||||
@@ -260,33 +319,174 @@ class FieldExtractor:
|
||||
|
||||
return digits, True, None
|
||||
|
||||
def _normalize_bankgiro(self, text: str) -> tuple[str | None, bool, str | None]:
|
||||
"""Normalize Bankgiro number."""
|
||||
digits = re.sub(r'\D', '', text)
|
||||
def _luhn_checksum(self, digits: str) -> bool:
|
||||
"""
|
||||
Validate using Luhn (Mod10) algorithm.
|
||||
Used for Bankgiro, Plusgiro, and OCR number validation.
|
||||
|
||||
if len(digits) == 8:
|
||||
# Format as XXXX-XXXX
|
||||
formatted = f"{digits[:4]}-{digits[4:]}"
|
||||
return formatted, True, None
|
||||
elif len(digits) == 7:
|
||||
# Format as XXX-XXXX
|
||||
formatted = f"{digits[:3]}-{digits[3:]}"
|
||||
return formatted, True, None
|
||||
elif 6 <= len(digits) <= 9:
|
||||
return digits, True, None
|
||||
else:
|
||||
return None, False, f"Invalid Bankgiro length: {len(digits)}"
|
||||
The checksum is valid if the total modulo 10 equals 0.
|
||||
"""
|
||||
if not digits.isdigit():
|
||||
return False
|
||||
|
||||
total = 0
|
||||
for i, char in enumerate(reversed(digits)):
|
||||
digit = int(char)
|
||||
if i % 2 == 1: # Double every second digit from right
|
||||
digit *= 2
|
||||
if digit > 9:
|
||||
digit -= 9
|
||||
total += digit
|
||||
|
||||
return total % 10 == 0
|
||||
|
||||
def _detect_giro_type(self, text: str) -> str | None:
|
||||
"""
|
||||
Detect if text matches BG or PG display format pattern.
|
||||
|
||||
BG typical format: ^\d{3,4}-\d{4}$ (e.g., 123-4567, 1234-5678)
|
||||
PG typical format: ^\d{1,7}-\d$ (e.g., 1-8, 12345-6, 1234567-8)
|
||||
|
||||
Returns: 'BG', 'PG', or None if cannot determine
|
||||
"""
|
||||
text = text.strip()
|
||||
|
||||
# BG pattern: 3-4 digits, dash, 4 digits (total 7-8 digits)
|
||||
if re.match(r'^\d{3,4}-\d{4}$', text):
|
||||
return 'BG'
|
||||
|
||||
# PG pattern: 1-7 digits, dash, 1 digit (total 2-8 digits)
|
||||
if re.match(r'^\d{1,7}-\d$', text):
|
||||
return 'PG'
|
||||
|
||||
return None
|
||||
|
||||
def _normalize_bankgiro(self, text: str) -> tuple[str | None, bool, str | None]:
    r"""
    Normalize Bankgiro number.

    Bankgiro rules:
    - 7 or 8 digits only
    - Last digit is Luhn (Mod10) check digit
    - Display format: XXX-XXXX (7 digits) or XXXX-XXXX (8 digits)

    Display pattern: ^\d{3,4}-\d{4}$
    Normalized pattern: ^\d{7,8}$

    Note: Text may contain both BG and PG numbers. We specifically look for
    BG display format (XXX-XXXX or XXXX-XXXX) to extract the correct one.

    Returns:
        (formatted, is_valid, message). ``formatted`` is in display format.
        A number whose Luhn check fails is still returned as valid, with
        a warning message. (None, False, message) when no Bankgiro-shaped
        number is present.
    """
    luhn_warning = "Luhn checksum failed (possible OCR error)"
    not_found = "No valid Bankgiro found in text"

    def display(digits: str) -> str:
        # The last four digits always follow the dash (XXX-XXXX / XXXX-XXXX).
        return f"{digits[:-4]}-{digits[-4:]}"

    # Look for BG display format: 3-4 digits, dash, 4 digits. The digit
    # lookarounds stop the pattern from matching inside a longer number,
    # e.g. picking "6123-4567" out of the org number "556123-4567".
    candidates = [
        head + tail
        for head, tail in re.findall(r'(?<!\d)(\d{3,4})-(\d{4})(?!\d)', text)
    ]
    if candidates:
        # Prefer the first candidate with a valid Luhn check digit.
        for digits in candidates:
            if self._luhn_checksum(digits):
                return display(digits), True, None
        # No valid Luhn, use first match
        return display(candidates[0]), True, luhn_warning

    # Fallback: try to find 7-8 consecutive digits.
    # But first check if text contains PG format (XXXXXXX-X); if so don't
    # use the fallback, to avoid misinterpreting PG as BG.
    if re.search(r'(?<![0-9])\d{1,7}-\d(?!\d)', text):
        return None, False, not_found

    digit_match = re.search(r'\b(\d{7,8})\b', text)
    if digit_match:
        digits = digit_match.group(1)
        message = None if self._luhn_checksum(digits) else luhn_warning
        return display(digits), True, message

    return None, False, not_found
|
||||
|
||||
def _normalize_plusgiro(self, text: str) -> tuple[str | None, bool, str | None]:
    r"""
    Normalize Plusgiro number.

    Plusgiro rules:
    - 2 to 8 digits
    - Last digit is Luhn (Mod10) check digit
    - Display format: XXXXXXX-X (all digits except last, dash, last digit)

    Display pattern: ^\d{1,7}-\d$
    Normalized pattern: ^\d{2,8}$

    Note: Text may contain both BG and PG numbers. We specifically look for
    PG display format (X-X, XX-X, ..., XXXXXXX-X) to extract the correct one.

    Returns:
        (formatted, is_valid, message). ``formatted`` is in display format.
        A number whose Luhn check fails is still returned as valid, with
        a warning message. (None, False, message) when no usable digit
        sequence is present.
    """
    luhn_warning = "Luhn checksum failed (possible OCR error)"

    def display(digits: str) -> str:
        # Everything but the check digit, dash, check digit.
        return f"{digits[:-1]}-{digits[-1]}"

    # First look for PG display format: 1-7 digits (possibly with spaces),
    # dash, 1 digit. This is distinct from BG format which has 4 digits
    # after the dash. Spaces inside the number ("486 98 63-6") are allowed.
    # (?<![0-9]) ensures we don't start from within another number (like BG).
    pg_matches = re.findall(r'(?<![0-9])(\d[\d\s]{0,10})-(\d)(?!\d)', text)

    # Strip the inner spaces and keep only candidates with a legal length,
    # so an over-long match cannot shadow a well-formed shorter one below.
    candidates = [re.sub(r'\s', '', body) + check for body, check in pg_matches]
    candidates = [digits for digits in candidates if 2 <= len(digits) <= 8]

    if candidates:
        # Prefer the first candidate with a valid Luhn check digit.
        for digits in candidates:
            if self._luhn_checksum(digits):
                return display(digits), True, None
        # No valid Luhn: use the candidate with the most digits
        # (the first one on ties).
        return display(max(candidates, key=len)), True, luhn_warning

    # If no PG format found, extract all digits and format as PG.
    # This handles cases where the number might be in BG format or raw digits.
    all_digits = re.sub(r'\D', '', text)
    if 2 <= len(all_digits) <= 8:
        digits = all_digits
    else:
        # Otherwise take the first stand-alone 2-8 digit sequence in text.
        digit_match = re.search(r'\b(\d{2,8})\b', text)
        if not digit_match:
            return None, False, "No valid Plusgiro found in text"
        digits = digit_match.group(1)

    message = None if self._luhn_checksum(digits) else luhn_warning
    return display(digits), True, message
|
||||
|
||||
def _normalize_amount(self, text: str) -> tuple[str | None, bool, str | None]:
|
||||
"""Normalize monetary amount."""
|
||||
@@ -366,6 +566,169 @@ class FieldExtractor:
|
||||
|
||||
return None, False, f"Cannot parse date: {text}"
|
||||
|
||||
def _normalize_payment_line(self, text: str) -> tuple[str | None, bool, str | None]:
    """
    Normalize payment line region text.

    Hands the raw text to MachineCodeParser's standard payment line parser
    and, when it yields any of the 'ocr', 'amount' or 'bankgiro' entries,
    returns them as a space-separated "OCR:… Amount:… BG:…" string.
    If nothing structured comes back, the input text is returned unchanged.
    The result is always flagged valid and carries no message.
    """
    from ..ocr.machine_code_parser import MachineCodeParser

    # For inference only raw text is available, so the text-level parsing
    # routine is called directly on a fresh parser.
    parsed = MachineCodeParser()._parse_standard_payment_line(text)

    if parsed:
        # (output label, key in the parse result), in output order.
        labelled_keys = (("OCR", 'ocr'), ("Amount", 'amount'), ("BG", 'bankgiro'))
        pieces = [
            f"{label}:{parsed[key]}"
            for label, key in labelled_keys
            if parsed.get(key)
        ]
        if pieces:
            return ' '.join(pieces), True, None

    # Fallback: return raw text if no structured parsing possible
    return text, True, None
|
||||
|
||||
def _normalize_supplier_org_number(self, text: str) -> tuple[str | None, bool, str | None]:
|
||||
"""
|
||||
Normalize Swedish supplier organization number.
|
||||
|
||||
Extracts organization number in format: NNNNNN-NNNN (10 digits)
|
||||
Also handles VAT numbers: SE + 10 digits + 01
|
||||
|
||||
Examples:
|
||||
'org.nr. 516406-1102, Filialregistret...' -> '516406-1102'
|
||||
'Momsreg.nr SE556123456701' -> '556123-4567'
|
||||
"""
|
||||
# Pattern 1: Standard org number format: NNNNNN-NNNN
|
||||
org_pattern = r'\b(\d{6})-?(\d{4})\b'
|
||||
match = re.search(org_pattern, text)
|
||||
if match:
|
||||
org_num = f"{match.group(1)}-{match.group(2)}"
|
||||
return org_num, True, None
|
||||
|
||||
# Pattern 2: VAT number format: SE + 10 digits + 01
|
||||
vat_pattern = r'SE\s*(\d{10})01'
|
||||
match = re.search(vat_pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
digits = match.group(1)
|
||||
org_num = f"{digits[:6]}-{digits[6:]}"
|
||||
return org_num, True, None
|
||||
|
||||
# Pattern 3: Just 10 consecutive digits
|
||||
digits_pattern = r'\b(\d{10})\b'
|
||||
match = re.search(digits_pattern, text)
|
||||
if match:
|
||||
digits = match.group(1)
|
||||
# Validate: first digit should be 1-9 for Swedish org numbers
|
||||
if digits[0] in '123456789':
|
||||
org_num = f"{digits[:6]}-{digits[6:]}"
|
||||
return org_num, True, None
|
||||
|
||||
return None, False, f"Cannot extract org number from: {text[:100]}"
|
||||
|
||||
def _normalize_customer_number(self, text: str) -> tuple[str | None, bool, str | None]:
    """
    Normalize customer number extracted from OCR.

    Customer numbers can have various formats:
    - With separators: 'JTY 576-3', 'EMM 256-6', 'FFL 019N'
    - Compact (no separators): 'JTY5763', 'EMM2566', 'FFL019N'
    - Mixed with names: 'VIKSTRÖM, ELIAS CH FFL 01' -> extract 'FFL 01'

    Note: Spaces and dashes may be removed from invoice display,
    so we need to match both 'JTY 576-3' and 'JTY5763' formats.

    Returns:
        (code, True, None) with the code upper-cased when something is
        extracted, otherwise (None, False, message).
    """
    from ..normalize.normalizer import FieldNormalizer

    # Clean the text using the same logic as matcher
    text = FieldNormalizer.clean_text(text)

    if not text:
        return None, False, "Empty text"

    # Customer number patterns - ordered by specificity.
    # Match both spaced/dashed versions and compact versions.
    customer_code_patterns = [
        # Letters + space/dash + digits + dash + digit (EMM 256-6, JTY 576-3)
        r'\b([A-Z]{2,4}[\s\-]?\d{1,4}[\s\-]\d{1,2}[A-Z]?)\b',
        # Letters + space/dash + digits + optional letter (FFL 019N, ABC 123X)
        r'\b([A-Z]{2,4}[\s\-]\d{2,4}[A-Z]?)\b',
        # Compact: letters immediately followed by digits + optional letter
        # (JTY5763, FFL019N)
        r'\b([A-Z]{2,4}\d{3,6}[A-Z]?)\b',
        # Single letter + digits (A12345)
        r'\b([A-Z]\d{4,6}[A-Z]?)\b',
        # Digits + dash/space + digits (123-456)
        r'\b(\d{3,6}[\s\-]\d{1,4})\b',
    ]

    all_matches = []
    for pattern in customer_code_patterns:
        all_matches.extend(re.findall(pattern, text, re.IGNORECASE))

    if all_matches:
        # Prefer longer matches and those appearing later in text (after
        # names): score = last position * 0.1 + length. The upper-cased
        # text is computed once instead of once per candidate; on equal
        # scores max() keeps the earliest candidate in pattern order.
        upper_text = text.upper()
        best_match = max(
            all_matches,
            key=lambda m: upper_text.rfind(m.upper()) * 0.1 + len(m),
        )
        return best_match.strip().upper(), True, None

    # Pattern 2: Look for explicit labels
    labeled_patterns = [
        r'(?:kund(?:nr|nummer|id)?|ert?\s*(?:kund)?(?:nr|nummer)?|customer\s*(?:no|number|id)?)\s*[:\.]?\s*([A-Za-z0-9][\w\s\-]{1,20}?)(?:\s{2,}|\n|$)',
    ]

    for pattern in labeled_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            extracted = match.group(1).strip()
            # Drop trailing whitespace and punctuation left by the lazy capture.
            extracted = re.sub(r'[\s\.\,\:]+$', '', extracted)
            if extracted and len(extracted) >= 2:
                return extracted.upper(), True, None

    # Pattern 3: If text contains comma (likely "NAME, NAME CODE"), extract
    # after last comma.
    # NOTE(review): the same patterns were already tried on the full text
    # above, so this branch looks unable to find anything new - confirm
    # before relying on it.
    if ',' in text:
        after_comma = text.split(',')[-1].strip()
        # Look for alphanumeric code in the part after comma
        for pattern in customer_code_patterns[:3]:  # Use first 3 patterns
            code_match = re.search(pattern, after_comma, re.IGNORECASE)
            if code_match:
                return code_match.group(1).strip().upper(), True, None

    # Pattern 4: Short text - filter out name-like words
    if len(text) <= 20:
        # Keep if: contains digits, or is short uppercase (likely abbreviation)
        code_parts = [
            word
            for word in text.split()
            if re.search(r'\d', word) or (len(word) <= 4 and word.isupper())
        ]
        if code_parts:
            result = ' '.join(code_parts).upper()
            if len(result) >= 3:
                return result, True, None

    # Fallback: return cleaned text if reasonable
    if 3 <= len(text) <= 15:
        return text.upper(), True, None

    return None, False, f"Cannot extract customer number from: {text[:50]}"
|
||||
|
||||
def extract_all_fields(
|
||||
self,
|
||||
detections: list[Detection],
|
||||
|
||||
Reference in New Issue
Block a user