66 lines
2.1 KiB
Python
66 lines
2.1 KiB
Python
"""
|
|
Exact match strategy.
|
|
"""
|
|
|
|
from .base import BaseMatchStrategy
|
|
from ..models import TokenLike, Match
|
|
from ..token_index import TokenIndex
|
|
from ..context import find_context_keywords
|
|
from ..utils import NON_DIGIT_PATTERN
|
|
|
|
|
|
class ExactMatcher(BaseMatchStrategy):
    """Find tokens that exactly match the value.

    Each token is compared against the target value in three tiers, tried
    in order; the first tier that succeeds determines the base score:

    * ``1.0``  - stripped token text equals the value verbatim;
    * ``0.95`` - the texts are equal after lowercasing;
    * ``0.9``  - (selected fields only) the texts are equal after
      ``NON_DIGIT_PATTERN`` matches have been removed from both.

    The base score is then raised by the boost returned from
    ``find_context_keywords`` and capped at ``1.0``.
    """

    # Fields for which the digits-only comparison tier is attempted.
    _DIGIT_FIELDS = frozenset({
        'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
        'supplier_organisation_number', 'supplier_accounts',
    })

    def find_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str,
        token_index: TokenIndex | None = None
    ) -> list[Match]:
        """Find exact matches.

        Args:
            tokens: Candidate tokens; each must expose ``text``, ``bbox``
                and ``page_no``.
            value: Target value to look for.
            field_name: Name of the field being matched. It selects whether
                the digits-only tier applies and is forwarded to
                ``find_context_keywords``.
            token_index: Optional index used to read a cached lowercase
                form of each token. When omitted, the lowercase form is
                computed directly so case-insensitive matching still works.

        Returns:
            One ``Match`` per matching token, in the order of ``tokens``.
        """
        matches = []
        value_lower = value.lower()
        # Only strip the value down to digits when the field calls for it;
        # ``None`` disables the digits-only tier for this call.
        value_digits = (
            NON_DIGIT_PATTERN.sub('', value)
            if field_name in self._DIGIT_FIELDS
            else None
        )

        for token in tokens:
            token_text = token.text.strip()

            score = self._base_score(
                token, token_text, value, value_lower, value_digits, token_index
            )
            if score is None:
                continue

            # Boost score if context keywords are nearby
            context_keywords, context_boost = find_context_keywords(
                tokens, token, field_name, self.context_radius, token_index
            )
            score = min(1.0, score + context_boost)

            matches.append(Match(
                field=field_name,
                value=value,
                bbox=token.bbox,
                page_no=token.page_no,
                score=score,
                matched_text=token_text,
                context_keywords=context_keywords
            ))

        return matches

    @staticmethod
    def _base_score(
        token,
        token_text,
        value,
        value_lower,
        value_digits,
        token_index,
    ):
        """Return the base score for one token, or ``None`` if it does not match."""
        # Exact match
        if token_text == value:
            return 1.0

        # Case-insensitive match. Prefer the cached lowercase text from the
        # index; without an index, lower the text here instead of silently
        # skipping this tier.
        if token_index:
            token_lower = token_index.get_text_lower(token).strip()
        else:
            token_lower = token_text.lower()
        if token_lower == value_lower:
            return 0.95

        # Digits-only match for numeric fields. An empty digit string never
        # counts as a match, so tokens without digits are rejected.
        if value_digits is not None:
            token_digits = NON_DIGIT_PATTERN.sub('', token_text)
            if token_digits and token_digits == value_digits:
                return 0.9

        return None
|