Re-structure the project.

This commit is contained in:
Yaojia Wang
2026-01-25 15:21:11 +01:00
parent 8fd61ea928
commit e599424a92
80 changed files with 10672 additions and 1584 deletions

92
src/matcher/context.py Normal file
View File

@@ -0,0 +1,92 @@
"""
Context keywords for field matching.
"""
from .models import TokenLike
from .token_index import TokenIndex
# Context keywords for each field type (Swedish invoice terms).
# Matching is case-insensitive substring search against nearby token text.
CONTEXT_KEYWORDS = {
    'InvoiceNumber': ['fakturanr', 'fakturanummer', 'invoice', 'inv.nr', 'inv nr', 'nr'],
    'InvoiceDate': ['fakturadatum', 'datum', 'date', 'utfärdad', 'utskriftsdatum', 'dokumentdatum'],
    'InvoiceDueDate': ['förfallodatum', 'förfaller', 'due date', 'betalas senast', 'att betala senast',
                       'förfallodag', 'oss tillhanda senast', 'senast'],
    'OCR': ['ocr', 'referens', 'betalningsreferens', 'ref'],
    'Bankgiro': ['bankgiro', 'bg', 'bg-nr', 'bg nr'],
    'Plusgiro': ['plusgiro', 'pg', 'pg-nr', 'pg nr'],
    'Amount': ['att betala', 'summa', 'total', 'belopp', 'amount', 'totalt', 'att erlägga', 'sek', 'kr'],
    'supplier_organisation_number': ['organisationsnummer', 'org.nr', 'org nr', 'orgnr', 'org.nummer',
                                     'momsreg', 'momsnr', 'moms nr', 'vat', 'corporate id'],
    'supplier_accounts': ['konto', 'kontonr', 'konto nr', 'account', 'klientnr', 'kundnr'],
}


def find_context_keywords(
    tokens: list[TokenLike],
    target_token: TokenLike,
    field_name: str,
    context_radius: float,
    token_index: TokenIndex | None = None
) -> tuple[list[str], float]:
    """
    Find context keywords near the target token.

    Uses the spatial index for efficient nearby-token lookup when available,
    otherwise falls back to an O(n) scan over all tokens, measuring
    center-to-center Euclidean distance between bounding boxes.

    Args:
        tokens: List of all tokens (used only by the fallback scan).
        target_token: The token to find context for.
        field_name: Name of the field; selects the keyword list from
            CONTEXT_KEYWORDS. Unknown fields yield ([], 0.0).
        context_radius: Search radius in pixels.
        token_index: Optional spatial index for efficient lookup.

    Returns:
        Tuple of (found_keywords, boost_score). A keyword that appears in
        several distinct nearby tokens is appended once per token; the boost
        is 0.10 per appended keyword, capped at 0.25.
    """
    keywords = CONTEXT_KEYWORDS.get(field_name, [])
    if not keywords:
        return [], 0.0

    found_keywords: list[str] = []

    # Explicit None check: a TokenIndex implementing __len__ could be falsy
    # when empty, which would silently force the O(n) fallback.
    if token_index is not None:
        for token in token_index.find_nearby(target_token, context_radius):
            # Skip the target itself, for consistency with the fallback path
            # below (find_nearby may include the query token itself).
            if token is target_token:
                continue
            # Use cached lowercase text from the index.
            token_lower = token_index.get_text_lower(token)
            for keyword in keywords:
                if keyword in token_lower:
                    found_keywords.append(keyword)
    else:
        # Fallback to O(n) scan if no index is available.
        # Hoist the loop-invariant target center out of the loop.
        target_cx = (target_token.bbox[0] + target_token.bbox[2]) / 2
        target_cy = (target_token.bbox[1] + target_token.bbox[3]) / 2
        for token in tokens:
            if token is target_token:
                continue
            token_cx = (token.bbox[0] + token.bbox[2]) / 2
            token_cy = (token.bbox[1] + token.bbox[3]) / 2
            dx = target_cx - token_cx
            dy = target_cy - token_cy
            if (dx * dx + dy * dy) ** 0.5 <= context_radius:
                token_lower = token.text.lower()
                for keyword in keywords:
                    if keyword in token_lower:
                        found_keywords.append(keyword)

    # Calculate boost based on keywords found.
    # Increased boost to better differentiate matches with/without context.
    boost = min(0.25, len(found_keywords) * 0.10)
    return found_keywords, boost