Files
invoice-master-poc-v2/packages/shared/shared/matcher/strategies/exact_matcher.py
2026-01-27 23:58:17 +01:00

66 lines
2.1 KiB
Python

"""
Exact match strategy.
"""
from .base import BaseMatchStrategy
from ..models import TokenLike, Match
from ..token_index import TokenIndex
from ..context import find_context_keywords
from ..utils import NON_DIGIT_PATTERN
class ExactMatcher(BaseMatchStrategy):
    """Find tokens whose text matches the target value.

    Three comparisons are tried per token, in order of decreasing
    confidence:

    1. exact string equality (base score 1.0),
    2. case-insensitive equality (base score 0.95),
    3. digits-only equality, for a fixed set of numeric fields
       (base score 0.9).

    The base score is then raised by any context boost reported by
    ``find_context_keywords`` and capped at 1.0.
    """

    # Fields for which a digits-only comparison is attempted as a last resort.
    _DIGIT_COMPARE_FIELDS = frozenset({
        'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
        'supplier_organisation_number', 'supplier_accounts',
    })

    def find_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str,
        token_index: TokenIndex | None = None
    ) -> list[Match]:
        """Return a ``Match`` for every token that matches ``value``.

        Args:
            tokens: Candidate tokens; each is read for ``text``, ``bbox``
                and ``page_no``.
            value: The value to look for.
            field_name: Name of the field being matched. It selects whether
                the digits-only comparison is enabled and is forwarded to
                the context-keyword lookup.
            token_index: Optional index used as a source of pre-lowercased
                token text. When omitted, the lowercase text is computed
                directly so the case-insensitive comparison still runs.

        Returns:
            One ``Match`` per matching token, in the order of ``tokens``.
        """
        value_lower = value.lower()
        # Loop-invariant: the digit-stripped target is only needed for the
        # numeric fields, so compute it once up front (or not at all).
        value_digits = (
            NON_DIGIT_PATTERN.sub('', value)
            if field_name in self._DIGIT_COMPARE_FIELDS
            else None
        )

        matches: list[Match] = []
        for token in tokens:
            token_text = token.text.strip()
            base_score = self._base_score(
                token, token_text, value, value_lower, value_digits, token_index
            )
            if base_score is None:
                continue

            # Boost score if context keywords are nearby.
            # NOTE(review): context_radius is not set in this class;
            # presumably provided by BaseMatchStrategy - confirm.
            context_keywords, context_boost = find_context_keywords(
                tokens, token, field_name, self.context_radius, token_index
            )
            matches.append(Match(
                field=field_name,
                value=value,
                bbox=token.bbox,
                page_no=token.page_no,
                score=min(1.0, base_score + context_boost),
                matched_text=token_text,
                context_keywords=context_keywords
            ))
        return matches

    @staticmethod
    def _base_score(
        token: TokenLike,
        token_text: str,
        value: str,
        value_lower: str,
        value_digits: str | None,
        token_index: TokenIndex | None,
    ) -> float | None:
        """Return the pre-boost score for one token, or ``None`` if no match."""
        # Exact match.
        if token_text == value:
            return 1.0

        # Case-insensitive match. Prefer the cached lowercase text from the
        # index; without an index, lowercase the token text here so the
        # outcome does not depend on whether a cache was supplied.
        if token_index:
            token_lower = token_index.get_text_lower(token).strip()
        else:
            token_lower = token_text.lower()
        if token_lower == value_lower:
            return 0.95

        # Digits-only match for numeric fields. An empty digit string is
        # rejected so tokens without digits never match a value without
        # digits.
        if value_digits is not None:
            token_digits = NON_DIGIT_PATTERN.sub('', token_text)
            if token_digits and token_digits == value_digits:
                return 0.9

        return None