WIP

2026-01-16 23:10:01 +01:00
parent 53d1e8db25
commit 425b8fdedf
10 changed files with 653 additions and 87 deletions
--- a/src/matcher/field_matcher.py
+++ b/src/matcher/field_matcher.py
@@ -14,6 +14,12 @@ from functools import cached_property
 _DATE_PATTERN = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
 _WHITESPACE_PATTERN = re.compile(r'\s+')
 _NON_DIGIT_PATTERN = re.compile(r'\D')
+_DASH_PATTERN = re.compile(r'[\u2013\u2014\u2212]')  # en-dash, em-dash, minus sign
+
+
+def _normalize_dashes(text: str) -> str:
+    """Return text with en dash, em dash and minus sign replaced by hyphen-minus (ASCII 45)."""
+    return _DASH_PATTERN.sub('-', text)


 class TokenLike(Protocol):
@@ -143,6 +149,9 @@ CONTEXT_KEYWORDS = {
    'Bankgiro': ['bankgiro', 'bg', 'bg-nr', 'bg nr'],
    'Plusgiro': ['plusgiro', 'pg', 'pg-nr', 'pg nr'],
    'Amount': ['att betala', 'summa', 'total', 'belopp', 'amount', 'totalt', 'att erlägga', 'sek', 'kr'],
+    'supplier_organisation_number': ['organisationsnummer', 'org.nr', 'org nr', 'orgnr', 'org.nummer',
+                                      'momsreg', 'momsnr', 'moms nr', 'vat', 'corporate id'],
+    'supplier_accounts': ['konto', 'kontonr', 'konto nr', 'account', 'klientnr', 'kundnr'],
 }


@@ -207,7 +216,10 @@ class FieldMatcher:

            # Strategy 4: Substring match (for values embedded in longer text)
            # e.g., "Fakturanummer: 2465027205" should match OCR value "2465027205"
-            if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount'):
+            # Note: Amount is excluded because short numbers like "451" can incorrectly match
+            # in OCR payment lines or other unrelated text
+            if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
+                              'supplier_organisation_number', 'supplier_accounts'):
                substring_matches = self._find_substring_matches(page_tokens, value, field_name)
                matches.extend(substring_matches)

@@ -237,7 +249,8 @@ class FieldMatcher:
        """Find tokens that exactly match the value."""
        matches = []
        value_lower = value.lower()
-        value_digits = _NON_DIGIT_PATTERN.sub('', value) if field_name in ('InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro') else None
+        value_digits = _NON_DIGIT_PATTERN.sub('', value) if field_name in ('InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
+                                                                            'supplier_organisation_number', 'supplier_accounts') else None

        for token in tokens:
            token_text = token.text.strip()
@@ -355,33 +368,36 @@ class FieldMatcher:
        matches = []

        # Supported fields for substring matching
-        supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount')
+        supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount',
+                            'supplier_organisation_number', 'supplier_accounts')
        if field_name not in supported_fields:
            return matches

        for token in tokens:
            token_text = token.text.strip()
+            # Normalize different dash types to hyphen-minus for matching
+            token_text_normalized = _normalize_dashes(token_text)

             # Skip tokens not longer than value (same length would be an exact match)
-            if len(token_text) <= len(value):
+            if len(token_text_normalized) <= len(value):
                continue

-            # Check if value appears as substring
-            if value in token_text:
+            # Check if value appears as substring (using normalized text)
+            if value in token_text_normalized:
                # Verify it's a proper boundary match (not part of a larger number)
-                idx = token_text.find(value)
+                idx = token_text_normalized.find(value)

                # Check character before (if exists)
                if idx > 0:
-                    char_before = token_text[idx - 1]
+                    char_before = token_text_normalized[idx - 1]
                    # Must be non-digit (allow : space - etc)
                    if char_before.isdigit():
                        continue

                # Check character after (if exists)
                end_idx = idx + len(value)
-                if end_idx < len(token_text):
-                    char_after = token_text[end_idx]
+                if end_idx < len(token_text_normalized):
+                    char_after = token_text_normalized[end_idx]
                    # Must be non-digit
                    if char_after.isdigit():
                        continue