diff --git a/src/matcher/field_matcher.py b/src/matcher/field_matcher.py index c7b2531..9de1042 100644 --- a/src/matcher/field_matcher.py +++ b/src/matcher/field_matcher.py @@ -106,8 +106,9 @@ class FieldMatcher: fuzzy_matches = self._find_fuzzy_matches(page_tokens, value, field_name) matches.extend(fuzzy_matches) - # Strategy 4: Substring match (for dates embedded in longer text) - if field_name in ('InvoiceDate', 'InvoiceDueDate'): + # Strategy 4: Substring match (for values embedded in longer text) + # e.g., the token "Fakturanummer: 2465027205" should match the InvoiceNumber value "2465027205" + if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro'): substring_matches = self._find_substring_matches(page_tokens, value, field_name) matches.extend(substring_matches) @@ -240,16 +241,19 @@ class FieldMatcher: """ Find value as a substring within longer tokens. - Handles cases like 'Fakturadatum: 2026-01-09' where the date - is embedded in a longer text string. + Handles cases like: + - 'Fakturadatum: 2026-01-09' where the date is embedded + - 'Fakturanummer: 2465027205' where the invoice number is embedded + - 'OCR: 1234567890' where the OCR reference number is embedded - Uses lower score (0.75) than exact match to prefer exact matches. - Only matches if the value appears as a distinct segment (not part of a number). + Uses lower score (0.75-0.85) than exact match to prefer exact matches. + Only matches if the value appears as a distinct segment (not part of a larger number). """ matches = [] - # Only use for date fields - other fields risk false positives - if field_name not in ('InvoiceDate', 'InvoiceDueDate'): + # Supported fields for substring matching + supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro') + if field_name not in supported_fields: return matches for token in tokens: