""" Context keywords for field matching. """ from .models import TokenLike from .token_index import TokenIndex # Context keywords for each field type (Swedish invoice terms) CONTEXT_KEYWORDS = { 'InvoiceNumber': ['fakturanr', 'fakturanummer', 'invoice', 'inv.nr', 'inv nr', 'nr'], 'InvoiceDate': ['fakturadatum', 'datum', 'date', 'utfärdad', 'utskriftsdatum', 'dokumentdatum'], 'InvoiceDueDate': ['förfallodatum', 'förfaller', 'due date', 'betalas senast', 'att betala senast', 'förfallodag', 'oss tillhanda senast', 'senast'], 'OCR': ['ocr', 'referens', 'betalningsreferens', 'ref'], 'Bankgiro': ['bankgiro', 'bg', 'bg-nr', 'bg nr'], 'Plusgiro': ['plusgiro', 'pg', 'pg-nr', 'pg nr'], 'Amount': ['att betala', 'summa', 'total', 'belopp', 'amount', 'totalt', 'att erlägga', 'sek', 'kr'], 'supplier_organisation_number': ['organisationsnummer', 'org.nr', 'org nr', 'orgnr', 'org.nummer', 'momsreg', 'momsnr', 'moms nr', 'vat', 'corporate id'], 'supplier_accounts': ['konto', 'kontonr', 'konto nr', 'account', 'klientnr', 'kundnr'], } def find_context_keywords( tokens: list[TokenLike], target_token: TokenLike, field_name: str, context_radius: float, token_index: TokenIndex | None = None ) -> tuple[list[str], float]: """ Find context keywords near the target token. Uses spatial index for O(1) average lookup instead of O(n) scan. Args: tokens: List of all tokens target_token: The token to find context for field_name: Name of the field context_radius: Search radius in pixels token_index: Optional spatial index for efficient lookup Returns: Tuple of (found_keywords, boost_score) """ keywords = CONTEXT_KEYWORDS.get(field_name, []) if not keywords: return [], 0.0 found_keywords = [] # Use spatial index for efficient nearby token lookup if token_index: nearby_tokens = token_index.find_nearby(target_token, context_radius) for token in nearby_tokens: # Use cached lowercase text token_lower = token_index.get_text_lower(token) for keyword in keywords: if keyword in token_lower: found_keywords.append(keyword) else: # Fallback to O(n) scan if no index available target_center = ( (target_token.bbox[0] + target_token.bbox[2]) / 2, (target_token.bbox[1] + target_token.bbox[3]) / 2 ) for token in tokens: if token is target_token: continue token_center = ( (token.bbox[0] + token.bbox[2]) / 2, (token.bbox[1] + token.bbox[3]) / 2 ) distance = ( (target_center[0] - token_center[0]) ** 2 + (target_center[1] - token_center[1]) ** 2 ) ** 0.5 if distance <= context_radius: token_lower = token.text.lower() for keyword in keywords: if keyword in token_lower: found_keywords.append(keyword) # Calculate boost based on keywords found # Increased boost to better differentiate matches with/without context boost = min(0.25, len(found_keywords) * 0.10) return found_keywords, boost