Add payment line parser and fix OCR override from payment_line

- Add MachineCodeParser for Swedish invoice payment line parsing
- Fix OCR Reference extraction by normalizing account number spaces
- Add cross-validation tests for pipeline and field_extractor
- Update UI layout for compact upload and full-width results

Key changes:
- machine_code_parser.py: Handle spaces in Bankgiro numbers (e.g. "78 2 1 713")
- pipeline.py: OCR and Amount override from payment_line, BG/PG comparison only
- field_extractor.py: Improved invoice number normalization
- app.py: Responsive UI layout changes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-21 21:47:02 +01:00
parent e9460e9f34
commit 4ea4bc96d4
33 changed files with 7530 additions and 562 deletions
--- a/src/inference/field_extractor.py
+++ b/src/inference/field_extractor.py
@@ -238,18 +238,77 @@ class FieldExtractor:
        elif field_name in ('InvoiceDate', 'InvoiceDueDate'):
            return self._normalize_date(text)

+        elif field_name == 'payment_line':
+            return self._normalize_payment_line(text)
+
+        elif field_name == 'supplier_org_number':
+            return self._normalize_supplier_org_number(text)
+
+        elif field_name == 'customer_number':
+            return self._normalize_customer_number(text)
+
        else:
            return text, True, None

    def _normalize_invoice_number(self, text: str) -> tuple[str | None, bool, str | None]:
-        """Normalize invoice number."""
-        # Extract digits only
+        """
+        Normalize invoice number.
+
+        Invoice numbers can be:
+        - Pure digits: 12345678
+        - Alphanumeric: A3861, INV-2024-001, F12345
+        - With separators: 2024/001, 2024-001
+
+        Strategy:
+        1. Look for common invoice number patterns
+        2. Prefer shorter, more specific matches over long digit sequences
+        """
+        # Pattern 1: Alphanumeric invoice number (letter + digits or digits + letter)
+        # Examples: A3861, F12345, INV001
+        alpha_patterns = [
+            r'\b([A-Z]{1,3}\d{3,10})\b',  # A3861, INV12345
+            r'\b(\d{3,10}[A-Z]{1,3})\b',  # 12345A
+            r'\b([A-Z]{2,5}[-/]?\d{3,10})\b',  # INV-12345, FAK12345
+        ]
+
+        for pattern in alpha_patterns:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                return match.group(1).upper(), True, None
+
+        # Pattern 2: Invoice number with year prefix (2024-001, 2024/12345)
+        year_pattern = r'\b(20\d{2}[-/]\d{3,8})\b'
+        match = re.search(year_pattern, text)
+        if match:
+            return match.group(1), True, None
+
+        # Pattern 3: Short digit sequence (3-10 digits) - prefer shorter sequences
+        # This avoids capturing long OCR numbers
+        digit_sequences = re.findall(r'\b(\d{3,10})\b', text)
+        if digit_sequences:
+            # Prefer shorter sequences (more likely to be invoice number)
+            # Also filter out sequences that look like dates (8 digits starting with 20)
+            valid_sequences = []
+            for seq in digit_sequences:
+                # Skip if it looks like a date (YYYYMMDD)
+                if len(seq) == 8 and seq.startswith('20'):
+                    continue
+                # Skip if too long (likely OCR number)
+                if len(seq) > 10:
+                    continue
+                valid_sequences.append(seq)
+
+            if valid_sequences:
+                # Return shortest valid sequence
+                return min(valid_sequences, key=len), True, None
+
+        # Fallback: extract all digits if nothing else works
        digits = re.sub(r'\D', '', text)
+        if len(digits) >= 3:
+            # Limit to first 15 digits to avoid very long sequences
+            return digits[:15], True, "Fallback extraction"

-        if len(digits) < 3:
-            return None, False, f"Too few digits: {len(digits)}"
-
-        return digits, True, None
+        return None, False, f"Cannot extract invoice number from: {text[:50]}"

    def _normalize_ocr_number(self, text: str) -> tuple[str | None, bool, str | None]:
        """Normalize OCR number."""
@@ -260,33 +319,174 @@ class FieldExtractor:

        return digits, True, None

-    def _normalize_bankgiro(self, text: str) -> tuple[str | None, bool, str | None]:
-        """Normalize Bankgiro number."""
-        digits = re.sub(r'\D', '', text)
+    def _luhn_checksum(self, digits: str) -> bool:
+        """
+        Validate using Luhn (Mod10) algorithm.
+        Used for Bankgiro, Plusgiro, and OCR number validation.

-        if len(digits) == 8:
-            # Format as XXXX-XXXX
-            formatted = f"{digits[:4]}-{digits[4:]}"
-            return formatted, True, None
-        elif len(digits) == 7:
-            # Format as XXX-XXXX
-            formatted = f"{digits[:3]}-{digits[3:]}"
-            return formatted, True, None
-        elif 6 <= len(digits) <= 9:
-            return digits, True, None
-        else:
-            return None, False, f"Invalid Bankgiro length: {len(digits)}"
+        The checksum is valid if the total modulo 10 equals 0.
+        """
+        if not digits.isdigit():
+            return False
+
+        total = 0
+        for i, char in enumerate(reversed(digits)):
+            digit = int(char)
+            if i % 2 == 1:  # Double every second digit from right
+                digit *= 2
+                if digit > 9:
+                    digit -= 9
+            total += digit
+
+        return total % 10 == 0
+
+    def _detect_giro_type(self, text: str) -> str | None:
+        """
+        Detect if text matches BG or PG display format pattern.
+
+        BG typical format: ^\d{3,4}-\d{4}$  (e.g., 123-4567, 1234-5678)
+        PG typical format: ^\d{1,7}-\d$     (e.g., 1-8, 12345-6, 1234567-8)
+
+        Returns: 'BG', 'PG', or None if cannot determine
+        """
+        text = text.strip()
+
+        # BG pattern: 3-4 digits, dash, 4 digits (total 7-8 digits)
+        if re.match(r'^\d{3,4}-\d{4}$', text):
+            return 'BG'
+
+        # PG pattern: 1-7 digits, dash, 1 digit (total 2-8 digits)
+        if re.match(r'^\d{1,7}-\d$', text):
+            return 'PG'
+
+        return None
+
+    def _normalize_bankgiro(self, text: str) -> tuple[str | None, bool, str | None]:
+        """
+        Normalize Bankgiro number.
+
+        Bankgiro rules:
+        - 7 or 8 digits only
+        - Last digit is Luhn (Mod10) check digit
+        - Display format: XXX-XXXX (7 digits) or XXXX-XXXX (8 digits)
+
+        Display pattern: ^\d{3,4}-\d{4}$
+        Normalized pattern: ^\d{7,8}$
+
+        Note: Text may contain both BG and PG numbers. We specifically look for
+        BG display format (XXX-XXXX or XXXX-XXXX) to extract the correct one.
+        """
+        # Look for BG display format pattern: 3-4 digits, dash, 4 digits
+        # This distinguishes BG from PG which uses X-X format (digits-single digit)
+        bg_matches = re.findall(r'(\d{3,4})-(\d{4})', text)
+
+        if bg_matches:
+            # Try each match and find one with valid Luhn
+            for match in bg_matches:
+                digits = match[0] + match[1]
+                if len(digits) in (7, 8) and self._luhn_checksum(digits):
+                    # Valid BG found
+                    if len(digits) == 8:
+                        formatted = f"{digits[:4]}-{digits[4:]}"
+                    else:
+                        formatted = f"{digits[:3]}-{digits[3:]}"
+                    return formatted, True, None
+
+            # No valid Luhn, use first match
+            digits = bg_matches[0][0] + bg_matches[0][1]
+            if len(digits) in (7, 8):
+                if len(digits) == 8:
+                    formatted = f"{digits[:4]}-{digits[4:]}"
+                else:
+                    formatted = f"{digits[:3]}-{digits[3:]}"
+                return formatted, True, f"Luhn checksum failed (possible OCR error)"
+
+        # Fallback: try to find 7-8 consecutive digits
+        # But first check if text contains PG format (XXXXXXX-X), if so don't use fallback
+        # to avoid misinterpreting PG as BG
+        pg_format_present = re.search(r'(?<![0-9])\d{1,7}-\d(?!\d)', text)
+        if pg_format_present:
+            return None, False, f"No valid Bankgiro found in text"
+
+        digit_match = re.search(r'\b(\d{7,8})\b', text)
+        if digit_match:
+            digits = digit_match.group(1)
+            if len(digits) in (7, 8):
+                luhn_ok = self._luhn_checksum(digits)
+                if len(digits) == 8:
+                    formatted = f"{digits[:4]}-{digits[4:]}"
+                else:
+                    formatted = f"{digits[:3]}-{digits[3:]}"
+                if luhn_ok:
+                    return formatted, True, None
+                else:
+                    return formatted, True, f"Luhn checksum failed (possible OCR error)"
+
+        return None, False, f"No valid Bankgiro found in text"

    def _normalize_plusgiro(self, text: str) -> tuple[str | None, bool, str | None]:
-        """Normalize Plusgiro number."""
-        digits = re.sub(r'\D', '', text)
+        """
+        Normalize Plusgiro number.

-        if len(digits) >= 6:
-            # Format as XXXXXXX-X
+        Plusgiro rules:
+        - 2 to 8 digits
+        - Last digit is Luhn (Mod10) check digit
+        - Display format: XXXXXXX-X (all digits except last, dash, last digit)
+
+        Display pattern: ^\d{1,7}-\d$
+        Normalized pattern: ^\d{2,8}$
+
+        Note: Text may contain both BG and PG numbers. We specifically look for
+        PG display format (X-X, XX-X, ..., XXXXXXX-X) to extract the correct one.
+        """
+        # First look for PG display format: 1-7 digits (possibly with spaces), dash, 1 digit
+        # This is distinct from BG format which has 4 digits after the dash
+        # Pattern allows spaces within the number like "486 98 63-6"
+        # (?<![0-9]) ensures we don't start from within another number (like BG)
+        pg_matches = re.findall(r'(?<![0-9])(\d[\d\s]{0,10})-(\d)(?!\d)', text)
+
+        if pg_matches:
+            # Try each match and find one with valid Luhn
+            for match in pg_matches:
+                # Remove spaces from the first part
+                digits = re.sub(r'\s', '', match[0]) + match[1]
+                if 2 <= len(digits) <= 8 and self._luhn_checksum(digits):
+                    # Valid PG found
+                    formatted = f"{digits[:-1]}-{digits[-1]}"
+                    return formatted, True, None
+
+            # No valid Luhn, use first match with most digits
+            best_match = max(pg_matches, key=lambda m: len(re.sub(r'\s', '', m[0])))
+            digits = re.sub(r'\s', '', best_match[0]) + best_match[1]
+            if 2 <= len(digits) <= 8:
+                formatted = f"{digits[:-1]}-{digits[-1]}"
+                return formatted, True, f"Luhn checksum failed (possible OCR error)"
+
+        # If no PG format found, extract all digits and format as PG
+        # This handles cases where the number might be in BG format or raw digits
+        all_digits = re.sub(r'\D', '', text)
+
+        # Try to find a valid 2-8 digit sequence
+        if 2 <= len(all_digits) <= 8:
+            luhn_ok = self._luhn_checksum(all_digits)
+            formatted = f"{all_digits[:-1]}-{all_digits[-1]}"
+            if luhn_ok:
+                return formatted, True, None
+            else:
+                return formatted, True, f"Luhn checksum failed (possible OCR error)"
+
+        # Try to find any 2-8 digit sequence in text
+        digit_match = re.search(r'\b(\d{2,8})\b', text)
+        if digit_match:
+            digits = digit_match.group(1)
+            luhn_ok = self._luhn_checksum(digits)
            formatted = f"{digits[:-1]}-{digits[-1]}"
-            return formatted, True, None
-        else:
-            return None, False, f"Invalid Plusgiro length: {len(digits)}"
+            if luhn_ok:
+                return formatted, True, None
+            else:
+                return formatted, True, f"Luhn checksum failed (possible OCR error)"
+
+        return None, False, f"No valid Plusgiro found in text"

    def _normalize_amount(self, text: str) -> tuple[str | None, bool, str | None]:
        """Normalize monetary amount."""
@@ -366,6 +566,169 @@ class FieldExtractor:

        return None, False, f"Cannot parse date: {text}"

+    def _normalize_payment_line(self, text: str) -> tuple[str | None, bool, str | None]:
+        """
+        Normalize payment line region text.
+
+        Extracts OCR, Amount, and Bankgiro from the payment line using MachineCodeParser.
+        """
+        from ..ocr.machine_code_parser import MachineCodeParser
+
+        # Create a simple token-like structure for the parser
+        # (The parser expects tokens, but for inference we have raw text)
+        parser = MachineCodeParser()
+
+        # Try to parse the standard payment line format
+        result = parser._parse_standard_payment_line(text)
+
+        if result:
+            # Format as structured output
+            parts = []
+            if result.get('ocr'):
+                parts.append(f"OCR:{result['ocr']}")
+            if result.get('amount'):
+                parts.append(f"Amount:{result['amount']}")
+            if result.get('bankgiro'):
+                parts.append(f"BG:{result['bankgiro']}")
+
+            if parts:
+                return ' '.join(parts), True, None
+
+        # Fallback: return raw text if no structured parsing possible
+        return text, True, None
+
+    def _normalize_supplier_org_number(self, text: str) -> tuple[str | None, bool, str | None]:
+        """
+        Normalize Swedish supplier organization number.
+
+        Extracts organization number in format: NNNNNN-NNNN (10 digits)
+        Also handles VAT numbers: SE + 10 digits + 01
+
+        Examples:
+            'org.nr. 516406-1102, Filialregistret...' -> '516406-1102'
+            'Momsreg.nr SE556123456701' -> '556123-4567'
+        """
+        # Pattern 1: Standard org number format: NNNNNN-NNNN
+        org_pattern = r'\b(\d{6})-?(\d{4})\b'
+        match = re.search(org_pattern, text)
+        if match:
+            org_num = f"{match.group(1)}-{match.group(2)}"
+            return org_num, True, None
+
+        # Pattern 2: VAT number format: SE + 10 digits + 01
+        vat_pattern = r'SE\s*(\d{10})01'
+        match = re.search(vat_pattern, text, re.IGNORECASE)
+        if match:
+            digits = match.group(1)
+            org_num = f"{digits[:6]}-{digits[6:]}"
+            return org_num, True, None
+
+        # Pattern 3: Just 10 consecutive digits
+        digits_pattern = r'\b(\d{10})\b'
+        match = re.search(digits_pattern, text)
+        if match:
+            digits = match.group(1)
+            # Validate: first digit should be 1-9 for Swedish org numbers
+            if digits[0] in '123456789':
+                org_num = f"{digits[:6]}-{digits[6:]}"
+                return org_num, True, None
+
+        return None, False, f"Cannot extract org number from: {text[:100]}"
+
+    def _normalize_customer_number(self, text: str) -> tuple[str | None, bool, str | None]:
+        """
+        Normalize customer number extracted from OCR.
+
+        Customer numbers can have various formats:
+        - With separators: 'JTY 576-3', 'EMM 256-6', 'FFL 019N'
+        - Compact (no separators): 'JTY5763', 'EMM2566', 'FFL019N'
+        - Mixed with names: 'VIKSTRÖM, ELIAS CH FFL 01' -> extract 'FFL 01'
+
+        Note: Spaces and dashes may be removed from invoice display,
+        so we need to match both 'JTY 576-3' and 'JTY5763' formats.
+        """
+        from ..normalize.normalizer import FieldNormalizer
+
+        # Clean the text using the same logic as matcher
+        text = FieldNormalizer.clean_text(text)
+
+        if not text:
+            return None, False, "Empty text"
+
+        # Customer number patterns - ordered by specificity
+        # Match both spaced/dashed versions and compact versions
+        customer_code_patterns = [
+            # Pattern: Letters + space/dash + digits + dash + digit (EMM 256-6, JTY 576-3)
+            r'\b([A-Z]{2,4}[\s\-]?\d{1,4}[\s\-]\d{1,2}[A-Z]?)\b',
+            # Pattern: Letters + space/dash + digits + optional letter (FFL 019N, ABC 123X)
+            r'\b([A-Z]{2,4}[\s\-]\d{2,4}[A-Z]?)\b',
+            # Pattern: Compact format - letters immediately followed by digits + optional letter (JTY5763, FFL019N)
+            r'\b([A-Z]{2,4}\d{3,6}[A-Z]?)\b',
+            # Pattern: Single letter + digits (A12345)
+            r'\b([A-Z]\d{4,6}[A-Z]?)\b',
+            # Pattern: Digits + dash/space + digits (123-456)
+            r'\b(\d{3,6}[\s\-]\d{1,4})\b',
+        ]
+
+        all_matches = []
+        for pattern in customer_code_patterns:
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            all_matches.extend(matches)
+
+        if all_matches:
+            # Prefer longer matches and those appearing later in text (after names)
+            # Sort by position in text (later = better) and length (longer = better)
+            scored_matches = []
+            for match in all_matches:
+                pos = text.upper().rfind(match.upper())
+                # Score: position * 0.1 + length (prefer later and longer)
+                score = pos * 0.1 + len(match)
+                scored_matches.append((score, match))
+
+            best_match = max(scored_matches, key=lambda x: x[0])[1]
+            return best_match.strip().upper(), True, None
+
+        # Pattern 2: Look for explicit labels
+        labeled_patterns = [
+            r'(?:kund(?:nr|nummer|id)?|ert?\s*(?:kund)?(?:nr|nummer)?|customer\s*(?:no|number|id)?)\s*[:\.]?\s*([A-Za-z0-9][\w\s\-]{1,20}?)(?:\s{2,}|\n|$)',
+        ]
+
+        for pattern in labeled_patterns:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                extracted = match.group(1).strip()
+                extracted = re.sub(r'[\s\.\,\:]+$', '', extracted)
+                if extracted and len(extracted) >= 2:
+                    return extracted.upper(), True, None
+
+        # Pattern 3: If text contains comma (likely "NAME, NAME CODE"), extract after last comma
+        if ',' in text:
+            after_comma = text.split(',')[-1].strip()
+            # Look for alphanumeric code in the part after comma
+            for pattern in customer_code_patterns[:3]:  # Use first 3 patterns
+                code_match = re.search(pattern, after_comma, re.IGNORECASE)
+                if code_match:
+                    return code_match.group(1).strip().upper(), True, None
+
+        # Pattern 4: Short text - filter out name-like words
+        if len(text) <= 20:
+            words = text.split()
+            code_parts = []
+            for word in words:
+                # Keep if: contains digits, or is short uppercase (likely abbreviation)
+                if re.search(r'\d', word) or (len(word) <= 4 and word.isupper()):
+                    code_parts.append(word)
+            if code_parts:
+                result = ' '.join(code_parts).upper()
+                if len(result) >= 3:
+                    return result, True, None
+
+        # Fallback: return cleaned text if reasonable
+        if text and 3 <= len(text) <= 15:
+            return text.upper(), True, None
+
+        return None, False, f"Cannot extract customer number from: {text[:50]}"
+
    def extract_all_fields(
        self,
        detections: list[Detection],