code issue fix

2026-01-17 18:55:46 +01:00
parent 510890d18c
commit e9460e9f34
9 changed files with 729 additions and 57 deletions
--- a/src/matcher/field_matcher.py
+++ b/src/matcher/field_matcher.py
@@ -219,7 +219,7 @@ class FieldMatcher:
            # Note: Amount is excluded because short numbers like "451" can incorrectly match
            # in OCR payment lines or other unrelated text
            if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
-                              'supplier_organisation_number', 'supplier_accounts'):
+                              'supplier_organisation_number', 'supplier_accounts', 'customer_number'):
                substring_matches = self._find_substring_matches(page_tokens, value, field_name)
                matches.extend(substring_matches)

@@ -369,7 +369,7 @@ class FieldMatcher:

        # Supported fields for substring matching
        supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount',
-                            'supplier_organisation_number', 'supplier_accounts')
+                            'supplier_organisation_number', 'supplier_accounts', 'customer_number')
        if field_name not in supported_fields:
            return matches

@@ -383,49 +383,59 @@ class FieldMatcher:
                continue

            # Check if value appears as substring (using normalized text)
+            # Try case-sensitive first, then case-insensitive (NOTE(review): idx assumes .lower() preserves length)
            if value in token_text_normalized:
-                # Verify it's a proper boundary match (not part of a larger number)
                idx = token_text_normalized.find(value)
+                case_sensitive_match = True
+            elif value.lower() in token_text_normalized.lower():
+                idx = token_text_normalized.lower().find(value.lower())
+                case_sensitive_match = False
+            else:
+                continue

-                # Check character before (if exists)
-                if idx > 0:
-                    char_before = token_text_normalized[idx - 1]
-                    # Must be non-digit (allow : space - etc)
-                    if char_before.isdigit():
-                        continue
+            # Verify it's a proper boundary match (not part of a larger number)
+            # Check character before (if exists)
+            if idx > 0:
+                char_before = token_text_normalized[idx - 1]
+                # Must be a non-digit (separators such as ':', ' ' or '-' are allowed)
+                if char_before.isdigit():
+                    continue

-                # Check character after (if exists)
-                end_idx = idx + len(value)
-                if end_idx < len(token_text_normalized):
-                    char_after = token_text_normalized[end_idx]
-                    # Must be non-digit
-                    if char_after.isdigit():
-                        continue
+            # Check character after (if exists)
+            end_idx = idx + len(value)
+            if end_idx < len(token_text_normalized):
+                char_after = token_text_normalized[end_idx]
+                # Must be non-digit
+                if char_after.isdigit():
+                    continue

-                # Found valid substring match
-                context_keywords, context_boost = self._find_context_keywords(
-                    tokens, token, field_name
-                )
+            # Found valid substring match
+            context_keywords, context_boost = self._find_context_keywords(
+                tokens, token, field_name
+            )

-                # Check if context keyword is in the same token (like "Fakturadatum:")
-                token_lower = token_text.lower()
-                inline_context = []
-                for keyword in CONTEXT_KEYWORDS.get(field_name, []):
-                    if keyword in token_lower:
-                        inline_context.append(keyword)
+            # Check if context keyword is in the same token (like "Fakturadatum:")
+            token_lower = token_text.lower()
+            inline_context = []
+            for keyword in CONTEXT_KEYWORDS.get(field_name, []):
+                if keyword in token_lower:
+                    inline_context.append(keyword)

-                # Boost score if keyword is inline
-                inline_boost = 0.1 if inline_context else 0
+            # Boost score if keyword is inline
+            inline_boost = 0.1 if inline_context else 0

-                matches.append(Match(
-                    field=field_name,
-                    value=value,
-                    bbox=token.bbox,  # Use full token bbox
-                    page_no=token.page_no,
-                    score=min(1.0, 0.75 + context_boost + inline_boost),  # Lower than exact match
-                    matched_text=token_text,
-                    context_keywords=context_keywords + inline_context
-                ))
+            # Lower score for case-insensitive match
+            base_score = 0.75 if case_sensitive_match else 0.70
+
+            matches.append(Match(
+                field=field_name,
+                value=value,
+                bbox=token.bbox,  # Use full token bbox
+                page_no=token.page_no,
+                score=min(1.0, base_score + context_boost + inline_boost),
+                matched_text=token_text,
+                context_keywords=context_keywords + inline_context
+            ))

        return matches