Fix: Enable substring matching for OCR, InvoiceNumber, Bankgiro, Plusgiro

Previously, substring matching was enabled only for date fields, so OCR values embedded in longer tokens such as "Fakturanummer: 2465027205" were not matched.

Changes:
- Extended Strategy 4 (substring match) to numeric fields
- Updated _find_substring_matches to support OCR, InvoiceNumber, Bankgiro, Plusgiro

This should significantly improve match rates for these fields.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 17:49:27 +01:00
parent 8938661850
commit dd69fbe9ed
1 changed files with 12 additions and 8 deletions
--- a/src/matcher/field_matcher.py
+++ b/src/matcher/field_matcher.py
@@ -106,8 +106,9 @@ class FieldMatcher:
                fuzzy_matches = self._find_fuzzy_matches(page_tokens, value, field_name)
                matches.extend(fuzzy_matches)

-            # Strategy 4: Substring match (for dates embedded in longer text)
-            if field_name in ('InvoiceDate', 'InvoiceDueDate'):
+            # Strategy 4: Substring match (for values embedded in longer text)
+            # e.g., "Fakturanummer: 2465027205" should match OCR value "2465027205"
+            if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro'):
                substring_matches = self._find_substring_matches(page_tokens, value, field_name)
                matches.extend(substring_matches)

@@ -240,16 +241,19 @@ class FieldMatcher:
        """
        Find value as a substring within longer tokens.

-        Handles cases like 'Fakturadatum: 2026-01-09' where the date
-        is embedded in a longer text string.
+        Handles cases like:
+        - 'Fakturadatum: 2026-01-09' where the date is embedded
+        - 'Fakturanummer: 2465027205' where OCR/invoice number is embedded
+        - 'OCR: 1234567890' where reference number is embedded

-        Uses lower score (0.75) than exact match to prefer exact matches.
-        Only matches if the value appears as a distinct segment (not part of a number).
+        Uses lower score (0.75-0.85) than exact match to prefer exact matches.
+        Only matches if the value appears as a distinct segment (not part of a larger number).
        """
        matches = []

-        # Only use for date fields - other fields risk false positives
-        if field_name not in ('InvoiceDate', 'InvoiceDueDate'):
+        # Supported fields for substring matching
+        supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro')
+        if field_name not in supported_fields:
            return matches

        for token in tokens: