Re-structure the project.
This commit is contained in:
52
src/matcher/strategies/fuzzy_matcher.py
Normal file
52
src/matcher/strategies/fuzzy_matcher.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Fuzzy match strategy for amounts and dates.
|
||||
"""
|
||||
|
||||
from .base import BaseMatchStrategy
|
||||
from ..models import TokenLike, Match
|
||||
from ..token_index import TokenIndex
|
||||
from ..context import find_context_keywords
|
||||
from ..utils import parse_amount
|
||||
|
||||
|
||||
class FuzzyMatcher(BaseMatchStrategy):
    """Find approximate matches for amounts and dates.

    NOTE(review): only the ``'Amount'`` field is handled by the code in
    this class; any other ``field_name`` (including dates) yields no
    matches.
    """

    def find_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str,
        token_index: TokenIndex | None = None
    ) -> list[Match]:
        """Find tokens whose parsed amount is numerically close to ``value``.

        Args:
            tokens: Candidate tokens; each must expose ``text``, ``bbox``
                and ``page_no``.
            value: The expected field value, parsed with ``parse_amount``.
            field_name: Name of the field being matched. Only ``'Amount'``
                produces matches.
            token_index: Optional index forwarded unchanged to
                ``find_context_keywords``.

        Returns:
            One ``Match`` per token whose parsed amount differs from the
            parsed ``value`` by less than 0.01. Each match is scored
            ``0.8`` plus the context boost, capped at ``1.0``. The list is
            empty when the field is not ``'Amount'`` or when ``value``
            cannot be parsed.
        """
        matches: list[Match] = []

        # The field check does not depend on the token, so decide once.
        if field_name != 'Amount':
            return matches

        # Failures that parsing/comparing amounts can reasonably raise.
        # Deliberately narrower than a bare ``except`` so KeyboardInterrupt,
        # SystemExit and genuine programming errors are not swallowed.
        parse_errors = (ValueError, TypeError, ArithmeticError)

        # The target value is loop-invariant: parse it a single time.
        try:
            value_num = parse_amount(value)
        except parse_errors:
            return matches
        if value_num is None:
            return matches

        for token in tokens:
            token_text = token.text.strip()

            # Best-effort: a token that cannot be parsed or compared is
            # skipped rather than aborting the whole search.
            try:
                token_num = parse_amount(token_text)
                if token_num is None:
                    continue
                is_close = abs(token_num - value_num) < 0.01  # Within 1 cent
            except parse_errors:
                continue

            if not is_close:
                continue

            context_keywords, context_boost = find_context_keywords(
                tokens, token, field_name, self.context_radius, token_index
            )

            matches.append(Match(
                field=field_name,
                value=value,
                bbox=token.bbox,
                page_no=token.page_no,
                score=min(1.0, 0.8 + context_boost),
                matched_text=token_text,
                context_keywords=context_keywords
            ))

        return matches
|
||||
Reference in New Issue
Block a user