invoice-master-poc-v2/packages/shared/shared/matcher/strategies/exact_matcher.py

"""
Exact match strategy.
"""

from .base import BaseMatchStrategy
from ..models import TokenLike, Match
from ..token_index import TokenIndex
from ..context import find_context_keywords
from ..utils import NON_DIGIT_PATTERN


class ExactMatcher(BaseMatchStrategy):
    """Find tokens whose text equals the searched value.

    Each token is tested against three rules, in order of decreasing
    strictness; the first rule that succeeds decides the base score:

    1. stripped token text equals ``value`` exactly           -> 1.0
    2. texts are equal after lowercasing                      -> 0.95
    3. (digit-only fields) the remaining digits are equal     -> 0.9

    The base score is then raised by the boost reported by
    ``find_context_keywords`` and capped at 1.0.
    """

    # Fields for which formatting characters are ignored: when the stricter
    # comparisons fail, value and token are compared after removing whatever
    # NON_DIGIT_PATTERN matches (non-digit characters, going by its name).
    _DIGIT_ONLY_FIELDS = (
        'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
        'supplier_organisation_number', 'supplier_accounts',
    )

    def find_matches(
        self,
        tokens: list[TokenLike],
        value: str,
        field_name: str,
        token_index: TokenIndex | None = None
    ) -> list[Match]:
        """Return one ``Match`` per token that equals ``value``.

        Args:
            tokens: Candidate tokens; each is read for ``text``, ``bbox``
                and ``page_no``.
            value: The value to look for. It is compared as given (only the
                token text is stripped of surrounding whitespace).
            field_name: Name of the field being matched. Selects whether the
                digits-only comparison is enabled and is forwarded to the
                context-keyword lookup.
            token_index: Optional index used as a cache for lowercased token
                text and forwarded to the context-keyword lookup. When it is
                omitted, the lowercase text is computed directly from the
                token so that case-insensitive matching still works.

        Returns:
            Matches in the same order as the tokens that produced them; an
            empty list when nothing matches.
        """
        matches: list[Match] = []
        value_lower = value.lower()
        # None disables the digits-only rule for non-numeric fields.
        value_digits = (
            NON_DIGIT_PATTERN.sub('', value)
            if field_name in self._DIGIT_ONLY_FIELDS
            else None
        )

        for token in tokens:
            token_text = token.text.strip()
            score = None

            if token_text == value:
                # Rule 1: exact match.
                score = 1.0
            else:
                # Rule 2: case-insensitive match. Prefer the cached lowercase
                # text from the index; without a usable index fall back to
                # lowercasing here instead of silently skipping the rule.
                if token_index:
                    token_lower = token_index.get_text_lower(token)
                else:
                    token_lower = token.text.lower()

                if token_lower.strip() == value_lower:
                    score = 0.95
                elif value_digits is not None:
                    # Rule 3: digits-only match for numeric fields. An empty
                    # digit string never matches, so tokens without any
                    # digits cannot match a value without digits.
                    token_digits = NON_DIGIT_PATTERN.sub('', token_text)
                    if token_digits and token_digits == value_digits:
                        score = 0.9

            if score is None:
                continue

            # Boost score if context keywords are nearby, never above 1.0.
            # NOTE(review): context_radius is not defined in this class;
            # presumably it is provided by BaseMatchStrategy - confirm.
            context_keywords, context_boost = find_context_keywords(
                tokens, token, field_name, self.context_radius, token_index
            )
            score = min(1.0, score + context_boost)

            matches.append(Match(
                field=field_name,
                value=value,
                bbox=token.bbox,
                page_no=token.page_no,
                score=score,
                matched_text=token_text,
                context_keywords=context_keywords
            ))

        return matches