invoice-master-poc-v2/tests/matcher/test_field_matcher.py

"""
Tests for the Field Matching Module.

Tests cover all matcher functions in src/matcher/field_matcher.py

Usage:
    pytest src/matcher/test_field_matcher.py -v -o 'addopts='
"""

import pytest
from dataclasses import dataclass
from src.matcher.field_matcher import FieldMatcher, find_field_matches
from src.matcher.models import Match
from src.matcher.token_index import TokenIndex
from src.matcher.context import CONTEXT_KEYWORDS, find_context_keywords
from src.matcher import utils as matcher_utils
from src.matcher.utils import normalize_dashes as _normalize_dashes
from src.matcher.strategies import (
    SubstringMatcher,
    FlexibleDateMatcher,
    FuzzyMatcher,
)


@dataclass
class MockToken:
    """Mock token for testing."""
    text: str
    bbox: tuple[float, float, float, float]
    page_no: int = 0


class TestNormalizeDashes:
    """Tests for _normalize_dashes function."""

    def test_normalize_en_dash(self):
        """Should normalize en-dash to hyphen."""
        assert _normalize_dashes("123\u2013456") == "123-456"

    def test_normalize_em_dash(self):
        """Should normalize em-dash to hyphen."""
        assert _normalize_dashes("123\u2014456") == "123-456"

    def test_normalize_minus_sign(self):
        """Should normalize minus sign to hyphen."""
        assert _normalize_dashes("123\u2212456") == "123-456"

    def test_normalize_middle_dot(self):
        """Should normalize middle dot to hyphen."""
        assert _normalize_dashes("123\u00b7456") == "123-456"

    def test_normal_hyphen_unchanged(self):
        """Should keep normal hyphen unchanged."""
        assert _normalize_dashes("123-456") == "123-456"


class TestTokenIndex:
    """Tests for TokenIndex class."""

    def test_build_index(self):
        """Should build spatial index from tokens."""
        tokens = [
            MockToken("hello", (0, 0, 50, 20)),
            MockToken("world", (60, 0, 110, 20)),
        ]
        index = TokenIndex(tokens)
        assert len(index.tokens) == 2

    def test_get_center(self):
        """Should return correct center coordinates."""
        token = MockToken("test", (0, 0, 100, 50))
        tokens = [token]
        index = TokenIndex(tokens)
        center = index.get_center(token)
        assert center == (50.0, 25.0)

    def test_get_text_lower(self):
        """Should return lowercase text."""
        token = MockToken("HELLO World", (0, 0, 100, 20))
        tokens = [token]
        index = TokenIndex(tokens)
        assert index.get_text_lower(token) == "hello world"

    def test_find_nearby_within_radius(self):
        """Should find tokens within radius."""
        token1 = MockToken("hello", (0, 0, 50, 20))
        token2 = MockToken("world", (60, 0, 110, 20))  # 60px away
        token3 = MockToken("far", (500, 0, 550, 20))  # 500px away
        tokens = [token1, token2, token3]
        index = TokenIndex(tokens)

        nearby = index.find_nearby(token1, radius=100)
        assert len(nearby) == 1
        assert nearby[0].text == "world"

    def test_find_nearby_excludes_self(self):
        """Should not include the target token itself."""
        token1 = MockToken("hello", (0, 0, 50, 20))
        token2 = MockToken("world", (60, 0, 110, 20))
        tokens = [token1, token2]
        index = TokenIndex(tokens)

        nearby = index.find_nearby(token1, radius=100)
        assert token1 not in nearby

    def test_find_nearby_empty_when_none_in_range(self):
        """Should return empty list when no tokens in range."""
        token1 = MockToken("hello", (0, 0, 50, 20))
        token2 = MockToken("far", (500, 0, 550, 20))
        tokens = [token1, token2]
        index = TokenIndex(tokens)

        nearby = index.find_nearby(token1, radius=50)
        assert len(nearby) == 0


class TestMatch:
    """Tests for Match dataclass."""

    def test_match_creation(self):
        """Should create Match with all fields."""
        match = Match(
            field="InvoiceNumber",
            value="12345",
            bbox=(0, 0, 100, 20),
            page_no=0,
            score=0.95,
            matched_text="12345",
            context_keywords=["fakturanr"]
        )
        assert match.field == "InvoiceNumber"
        assert match.value == "12345"
        assert match.score == 0.95

    def test_to_yolo_format(self):
        """Should convert to YOLO annotation format."""
        match = Match(
            field="Amount",
            value="100",
            bbox=(100, 200, 200, 250),  # x0, y0, x1, y1
            page_no=0,
            score=1.0,
            matched_text="100",
            context_keywords=[]
        )
        # Image: 1000x1000
        yolo = match.to_yolo_format(1000, 1000, class_id=5)

        # Expected: center_x=150, center_y=225, width=100, height=50
        # Normalized: x_center=0.15, y_center=0.225, w=0.1, h=0.05
        assert yolo.startswith("5 ")
        parts = yolo.split()
        assert len(parts) == 5
        assert float(parts[1]) == pytest.approx(0.15, rel=1e-4)
        assert float(parts[2]) == pytest.approx(0.225, rel=1e-4)
        assert float(parts[3]) == pytest.approx(0.1, rel=1e-4)
        assert float(parts[4]) == pytest.approx(0.05, rel=1e-4)


class TestFieldMatcher:
    """Tests for FieldMatcher class."""

    def test_init_defaults(self):
        """Should initialize with default values."""
        matcher = FieldMatcher()
        assert matcher.context_radius == 200.0
        assert matcher.min_score_threshold == 0.5

    def test_init_custom_params(self):
        """Should initialize with custom parameters."""
        matcher = FieldMatcher(context_radius=300.0, min_score_threshold=0.7)
        assert matcher.context_radius == 300.0
        assert matcher.min_score_threshold == 0.7


class TestFieldMatcherExactMatch:
    """Tests for exact matching."""

    def test_exact_match_full_score(self):
        """Should find exact match with full score."""
        matcher = FieldMatcher()
        tokens = [MockToken("12345", (0, 0, 50, 20))]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])

        assert len(matches) >= 1
        assert matches[0].score == 1.0
        assert matches[0].matched_text == "12345"

    def test_case_insensitive_match(self):
        """Should find case-insensitive match with lower score."""
        matcher = FieldMatcher()
        tokens = [MockToken("HELLO", (0, 0, 50, 20))]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["hello"])

        assert len(matches) >= 1
        assert matches[0].score == 0.95

    def test_digits_only_match(self):
        """Should match by digits only for numeric fields."""
        matcher = FieldMatcher()
        tokens = [MockToken("INV-12345", (0, 0, 80, 20))]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])

        assert len(matches) >= 1
        assert matches[0].score == 0.9

    def test_no_match_when_different(self):
        """Should return empty when no match found."""
        matcher = FieldMatcher(min_score_threshold=0.8)
        tokens = [MockToken("99999", (0, 0, 50, 20))]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])

        assert len(matches) == 0


class TestFieldMatcherContextKeywords:
    """Tests for context keyword boosting."""

    def test_context_boost_with_nearby_keyword(self):
        """Should boost score when context keyword is nearby."""
        matcher = FieldMatcher(context_radius=200)
        tokens = [
            MockToken("fakturanr", (0, 0, 80, 20)),  # Context keyword
            MockToken("12345", (100, 0, 150, 20)),   # Value
        ]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])

        assert len(matches) >= 1
        # Score should be boosted above 1.0 (capped at 1.0)
        assert matches[0].score == 1.0
        assert "fakturanr" in matches[0].context_keywords

    def test_no_boost_when_keyword_far_away(self):
        """Should not boost when keyword is too far."""
        matcher = FieldMatcher(context_radius=50)
        tokens = [
            MockToken("fakturanr", (0, 0, 80, 20)),   # Context keyword
            MockToken("12345", (500, 0, 550, 20)),   # Value - far away
        ]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])

        assert len(matches) >= 1
        assert "fakturanr" not in matches[0].context_keywords


class TestFieldMatcherConcatenatedMatch:
    """Tests for concatenated token matching."""

    def test_concatenate_adjacent_tokens(self):
        """Should match value split across adjacent tokens."""
        matcher = FieldMatcher()
        tokens = [
            MockToken("123", (0, 0, 30, 20)),
            MockToken("456", (35, 0, 65, 20)),  # Adjacent, same line
        ]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["123456"])

        assert len(matches) >= 1
        assert "123456" in matches[0].matched_text or matches[0].value == "123456"

    def test_no_concatenate_when_gap_too_large(self):
        """Should not concatenate when gap is too large."""
        matcher = FieldMatcher()
        tokens = [
            MockToken("123", (0, 0, 30, 20)),
            MockToken("456", (100, 0, 130, 20)),  # Gap > 50px
        ]

        # This might still match if exact matches work differently
        matches = matcher.find_matches(tokens, "InvoiceNumber", ["123456"])
        # No concatenated match expected (only from exact/substring)
        concat_matches = [m for m in matches if "123456" in m.matched_text]
        # May or may not find depending on strategy


class TestFieldMatcherSubstringMatch:
    """Tests for substring matching."""

    def test_substring_match_in_longer_text(self):
        """Should find value as substring in longer token."""
        matcher = FieldMatcher()
        tokens = [MockToken("Fakturanummer: 12345", (0, 0, 150, 20))]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])

        assert len(matches) >= 1
        # Substring match should have lower score
        substring_match = [m for m in matches if "12345" in m.matched_text]
        assert len(substring_match) >= 1

    def test_no_substring_match_when_part_of_larger_number(self):
        """Should not match when value is part of a larger number."""
        matcher = FieldMatcher(min_score_threshold=0.6)
        tokens = [MockToken("123456789", (0, 0, 100, 20))]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["456"])

        # Should not match because 456 is embedded in larger number
        assert len(matches) == 0


class TestFieldMatcherFuzzyMatch:
    """Tests for fuzzy amount matching."""

    def test_fuzzy_amount_match(self):
        """Should match amounts that are numerically equal."""
        matcher = FieldMatcher()
        tokens = [MockToken("1234,56", (0, 0, 70, 20))]

        matches = matcher.find_matches(tokens, "Amount", ["1234.56"])

        assert len(matches) >= 1

    def test_fuzzy_amount_with_different_formats(self):
        """Should match amounts in different formats."""
        matcher = FieldMatcher()
        tokens = [MockToken("1 234,56", (0, 0, 80, 20))]

        matches = matcher.find_matches(tokens, "Amount", ["1234,56"])

        assert len(matches) >= 1


class TestFieldMatcherParseAmount:
    """Tests for parse_amount function."""

    def test_parse_simple_integer(self):
        """Should parse simple integer."""
        assert matcher_utils.parse_amount("100") == 100.0

    def test_parse_decimal_with_dot(self):
        """Should parse decimal with dot."""
        assert matcher_utils.parse_amount("100.50") == 100.50

    def test_parse_decimal_with_comma(self):
        """Should parse decimal with comma (European format)."""
        assert matcher_utils.parse_amount("100,50") == 100.50

    def test_parse_with_thousand_separator(self):
        """Should parse with thousand separator."""
        assert matcher_utils.parse_amount("1 234,56") == 1234.56

    def test_parse_with_currency_suffix(self):
        """Should parse and remove currency suffix."""
        assert matcher_utils.parse_amount("100 SEK") == 100.0
        assert matcher_utils.parse_amount("100 kr") == 100.0

    def test_parse_swedish_ore_format(self):
        """Should parse Swedish öre format (kronor space öre)."""
        assert matcher_utils.parse_amount("239 00") == 239.00
        assert matcher_utils.parse_amount("1234 50") == 1234.50

    def test_parse_invalid_returns_none(self):
        """Should return None for invalid input."""
        assert matcher_utils.parse_amount("abc") is None
        assert matcher_utils.parse_amount("") is None


class TestFieldMatcherTokensOnSameLine:
    """Tests for tokens_on_same_line function."""

    def test_same_line_tokens(self):
        """Should detect tokens on same line."""
        token1 = MockToken("hello", (0, 10, 50, 30))
        token2 = MockToken("world", (60, 12, 110, 28))  # Slight y variation

        assert matcher_utils.tokens_on_same_line(token1, token2) is True

    def test_different_line_tokens(self):
        """Should detect tokens on different lines."""
        token1 = MockToken("hello", (0, 10, 50, 30))
        token2 = MockToken("world", (0, 50, 50, 70))  # Different y

        assert matcher_utils.tokens_on_same_line(token1, token2) is False


class TestFieldMatcherBboxOverlap:
    """Tests for bbox_overlap function."""

    def test_full_overlap(self):
        """Should return 1.0 for identical bboxes."""
        bbox = (0, 0, 100, 50)
        assert matcher_utils.bbox_overlap(bbox, bbox) == 1.0

    def test_partial_overlap(self):
        """Should calculate partial overlap correctly."""
        bbox1 = (0, 0, 100, 100)
        bbox2 = (50, 50, 150, 150)  # 50% overlap on each axis

        overlap = matcher_utils.bbox_overlap(bbox1, bbox2)
        # Intersection: 50x50=2500, Union: 10000+10000-2500=17500
        # IoU = 2500/17500 ≈ 0.143
        assert 0.1 < overlap < 0.2

    def test_no_overlap(self):
        """Should return 0.0 for non-overlapping bboxes."""
        bbox1 = (0, 0, 50, 50)
        bbox2 = (100, 100, 150, 150)

        assert matcher_utils.bbox_overlap(bbox1, bbox2) == 0.0


class TestFieldMatcherDeduplication:
    """Tests for match deduplication."""

    def test_deduplicate_overlapping_matches(self):
        """Should keep only highest scoring match for overlapping bboxes."""
        matcher = FieldMatcher()
        tokens = [
            MockToken("12345", (0, 0, 50, 20)),
        ]

        # Find matches with multiple values that could match same token
        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345", "12345"])

        # Should deduplicate to single match
        assert len(matches) == 1


class TestFieldMatcherFlexibleDateMatch:
    """Tests for flexible date matching."""

    def test_flexible_date_same_month(self):
        """Should match dates in same year-month when exact match fails."""
        matcher = FieldMatcher()
        tokens = [
            MockToken("2025-01-15", (0, 0, 80, 20)),  # Slightly different day
        ]

        # Search for different day in same month
        matches = matcher.find_matches(
            tokens, "InvoiceDate", ["2025-01-10"]
        )

        # Should find flexible match (lower score)
        # Note: This depends on exact match failing first
        # If exact match works, flexible won't be tried


class TestFieldMatcherPageFiltering:
    """Tests for page number filtering."""

    def test_filters_by_page_number(self):
        """Should only match tokens on specified page."""
        matcher = FieldMatcher()
        tokens = [
            MockToken("12345", (0, 0, 50, 20), page_no=0),
            MockToken("12345", (0, 0, 50, 20), page_no=1),
        ]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"], page_no=0)

        assert all(m.page_no == 0 for m in matches)

    def test_excludes_hidden_tokens(self):
        """Should exclude tokens with negative y coordinates (metadata)."""
        matcher = FieldMatcher()
        tokens = [
            MockToken("12345", (0, -100, 50, -80), page_no=0),  # Hidden metadata
            MockToken("67890", (0, 0, 50, 20), page_no=0),      # Visible
        ]

        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"], page_no=0)

        # Should not match the hidden token
        assert len(matches) == 0


class TestContextKeywordsMapping:
    """Tests for CONTEXT_KEYWORDS constant."""

    def test_all_fields_have_keywords(self):
        """Should have keywords for all expected fields."""
        expected_fields = [
            "InvoiceNumber",
            "InvoiceDate",
            "InvoiceDueDate",
            "OCR",
            "Bankgiro",
            "Plusgiro",
            "Amount",
            "supplier_organisation_number",
            "supplier_accounts",
        ]
        for field in expected_fields:
            assert field in CONTEXT_KEYWORDS
            assert len(CONTEXT_KEYWORDS[field]) > 0

    def test_keywords_are_lowercase(self):
        """All keywords should be lowercase."""
        for field, keywords in CONTEXT_KEYWORDS.items():
            for kw in keywords:
                assert kw == kw.lower(), f"Keyword '{kw}' in {field} should be lowercase"


class TestFindFieldMatches:
    """Tests for find_field_matches convenience function."""

    def test_finds_multiple_fields(self):
        """Should find matches for multiple fields."""
        tokens = [
            MockToken("12345", (0, 0, 50, 20)),
            MockToken("100,00", (0, 30, 60, 50)),
        ]
        field_values = {
            "InvoiceNumber": "12345",
            "Amount": "100",
        }

        results = find_field_matches(tokens, field_values)

        assert "InvoiceNumber" in results
        assert "Amount" in results
        assert len(results["InvoiceNumber"]) >= 1
        assert len(results["Amount"]) >= 1

    def test_skips_empty_values(self):
        """Should skip fields with None or empty values."""
        tokens = [MockToken("12345", (0, 0, 50, 20))]
        field_values = {
            "InvoiceNumber": "12345",
            "Amount": None,
            "OCR": "",
        }

        results = find_field_matches(tokens, field_values)

        assert "InvoiceNumber" in results
        assert "Amount" not in results
        assert "OCR" not in results


class TestSubstringMatchEdgeCases:
    """Additional edge case tests for substring matching."""

    def test_unsupported_field_returns_empty(self):
        """Should return empty for unsupported field types."""
        # Line 380: field_name not in supported_fields
        substring_matcher = SubstringMatcher()
        tokens = [MockToken("Faktura: 12345", (0, 0, 100, 20))]

        # Message is not a supported field for substring matching
        matches = substring_matcher.find_matches(tokens, "12345", "Message")
        assert len(matches) == 0

    def test_case_insensitive_substring_match(self):
        """Should find case-insensitive substring match."""
        # Line 397-398: case-insensitive substring matching
        substring_matcher = SubstringMatcher()
        # Use token without inline keyword to isolate case-insensitive behavior
        tokens = [MockToken("REF: ABC123", (0, 0, 100, 20))]

        matches = substring_matcher.find_matches(tokens, "abc123", "InvoiceNumber")

        assert len(matches) >= 1
        # Case-insensitive base score is 0.70 (vs 0.75 for case-sensitive)
        # Score may have context boost but base should be lower
        assert matches[0].score <= 0.80  # 0.70 base + possible small boost

    def test_substring_with_digit_before(self):
        """Should not match when digit appears before value."""
        # Line 407-408: char_before.isdigit() continue
        substring_matcher = SubstringMatcher()
        tokens = [MockToken("9912345", (0, 0, 60, 20))]

        matches = substring_matcher.find_matches(tokens, "12345", "InvoiceNumber")
        assert len(matches) == 0

    def test_substring_with_digit_after(self):
        """Should not match when digit appears after value."""
        # Line 413-416: char_after.isdigit() continue
        substring_matcher = SubstringMatcher()
        tokens = [MockToken("12345678", (0, 0, 70, 20))]

        matches = substring_matcher.find_matches(tokens, "12345", "InvoiceNumber")
        assert len(matches) == 0

    def test_substring_with_inline_keyword(self):
        """Should boost score when keyword is in same token."""
        substring_matcher = SubstringMatcher()
        tokens = [MockToken("Fakturanr: 12345", (0, 0, 100, 20))]

        matches = substring_matcher.find_matches(tokens, "12345", "InvoiceNumber")

        assert len(matches) >= 1
        # Should have inline keyword boost
        assert "fakturanr" in matches[0].context_keywords


class TestFlexibleDateMatchEdgeCases:
    """Additional edge case tests for flexible date matching."""

    def test_no_valid_date_in_normalized_values(self):
        """Should return empty when no valid date in normalized values."""
        # Line 520-521, 524: target_date parsing failures
        date_matcher = FlexibleDateMatcher()
        tokens = [MockToken("2025-01-15", (0, 0, 80, 20))]

        # Pass non-date value
        matches = date_matcher.find_matches(
            tokens, "not-a-date", "InvoiceDate"
        )
        assert len(matches) == 0

    def test_no_date_tokens_found(self):
        """Should return empty when no date tokens in document."""
        # Line 571-572: no date_candidates
        date_matcher = FlexibleDateMatcher()
        tokens = [MockToken("Hello World", (0, 0, 80, 20))]

        matches = date_matcher.find_matches(
            tokens, "2025-01-15", "InvoiceDate"
        )
        assert len(matches) == 0

    def test_flexible_date_within_7_days(self):
        """Should score higher for dates within 7 days."""
        # Line 582-583: days_diff <= 7
        date_matcher = FlexibleDateMatcher()
        tokens = [
            MockToken("2025-01-18", (0, 0, 80, 20)),  # 3 days from target
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-15", "InvoiceDate"
        )

        assert len(matches) >= 1
        assert matches[0].score >= 0.75

    def test_flexible_date_within_3_days(self):
        """Should score highest for dates within 3 days."""
        # Line 584-585: days_diff <= 3
        date_matcher = FlexibleDateMatcher()
        tokens = [
            MockToken("2025-01-17", (0, 0, 80, 20)),  # 2 days from target
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-15", "InvoiceDate"
        )

        assert len(matches) >= 1
        assert matches[0].score >= 0.8

    def test_flexible_date_within_14_days_different_month(self):
        """Should match dates within 14 days even in different month."""
        # Line 587-588: days_diff <= 14, different year-month
        date_matcher = FlexibleDateMatcher()
        tokens = [
            MockToken("2025-02-05", (0, 0, 80, 20)),  # 10 days from Jan 26
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-26", "InvoiceDate"
        )

        assert len(matches) >= 1

    def test_flexible_date_within_30_days(self):
        """Should match dates within 30 days with lower score."""
        # Line 589-590: days_diff <= 30
        date_matcher = FlexibleDateMatcher()
        tokens = [
            MockToken("2025-02-10", (0, 0, 80, 20)),  # 25 days from target
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-16", "InvoiceDate"
        )

        assert len(matches) >= 1
        assert matches[0].score >= 0.55

    def test_flexible_date_far_apart_without_context(self):
        """Should skip dates too far apart without context keywords."""
        # Line 591-595: > 30 days, no context
        date_matcher = FlexibleDateMatcher()
        tokens = [
            MockToken("2025-06-15", (0, 0, 80, 20)),  # Many months from target
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-15", "InvoiceDate"
        )

        # Should be empty - too far apart and no context
        assert len(matches) == 0

    def test_flexible_date_far_with_context(self):
        """Should match distant dates if context keywords present."""
        # Line 592-595: > 30 days but has context
        date_matcher = FlexibleDateMatcher(context_radius=200)
        tokens = [
            MockToken("fakturadatum", (0, 0, 80, 20)),  # Context keyword
            MockToken("2025-06-15", (90, 0, 170, 20)),  # Distant date
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-15", "InvoiceDate"
        )

        # May match due to context keyword
        # (depends on how context is detected in flexible match)

    def test_flexible_date_boost_with_context(self):
        """Should boost flexible date score with context keywords."""
        # Line 598, 602-603: context_boost applied
        date_matcher = FlexibleDateMatcher(context_radius=200)
        tokens = [
            MockToken("fakturadatum", (0, 0, 80, 20)),
            MockToken("2025-01-18", (90, 0, 170, 20)),  # 3 days from target
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-15", "InvoiceDate"
        )

        if len(matches) > 0:
            assert len(matches[0].context_keywords) >= 0


class TestContextKeywordFallback:
    """Tests for context keyword lookup fallback (no spatial index)."""

    def test_fallback_context_lookup_without_index(self):
        """Should find context using O(n) scan when no index available."""
        # Line 650-673: fallback context lookup
        matcher = FieldMatcher(context_radius=200)
        # Don't use find_matches which builds index, call internal method directly

        tokens = [
            MockToken("fakturanr", (0, 0, 80, 20)),
            MockToken("12345", (100, 0, 150, 20)),
        ]

        # _token_index is None, so fallback is used
        keywords, boost = find_context_keywords(tokens, tokens[1], "InvoiceNumber", 200.0)

        assert "fakturanr" in keywords
        assert boost > 0

    def test_context_lookup_skips_self(self):
        """Should skip the target token itself in fallback search."""
        # Line 656-657: token is target_token continue
        matcher = FieldMatcher(context_radius=200)
        matcher._token_index = None  # Force fallback

        token = MockToken("fakturanr 12345", (0, 0, 150, 20))
        tokens = [token]

        keywords, boost = find_context_keywords(tokens, token, "InvoiceNumber", 200.0)

        # Token contains keyword but is the target - should still find if keyword in token
        # Actually this tests that it doesn't error when target is in list


class TestFieldWithoutContextKeywords:
    """Tests for fields without defined context keywords."""

    def test_field_without_keywords_returns_empty(self):
        """Should return empty keywords for fields not in CONTEXT_KEYWORDS."""
        # Line 633-635: keywords empty, return early
        matcher = FieldMatcher()
        matcher._token_index = None

        tokens = [MockToken("hello", (0, 0, 50, 20))]

        # customer_number is not in CONTEXT_KEYWORDS
        keywords, boost = find_context_keywords(tokens, tokens[0], "UnknownField", 200.0)

        assert keywords == []
        assert boost == 0.0


class TestParseAmountEdgeCases:
    """Additional edge case tests for _parse_amount."""

    def test_parse_amount_with_parentheses(self):
        """Should remove parenthesized text like (inkl. moms)."""
        matcher = FieldMatcher()
        result = matcher_utils.parse_amount("100 (inkl. moms)")
        assert result == 100.0

    def test_parse_amount_with_kronor_suffix(self):
        """Should handle 'kronor' suffix."""
        matcher = FieldMatcher()
        result = matcher_utils.parse_amount("100 kronor")
        assert result == 100.0

    def test_parse_amount_numeric_input(self):
        """Should handle numeric input (int/float)."""
        matcher = FieldMatcher()
        assert matcher_utils.parse_amount(100) == 100.0
        assert matcher_utils.parse_amount(100.5) == 100.5


class TestFuzzyMatchExceptionHandling:
    """Tests for exception handling in fuzzy matching."""

    def test_fuzzy_match_with_unparseable_token(self):
        """Should handle tokens that can't be parsed as amounts."""
        # Line 481-482: except clause in fuzzy matching
        matcher = FieldMatcher()
        # Create a token that will cause parse issues
        tokens = [MockToken("abc xyz", (0, 0, 50, 20))]

        # This should not raise, just return empty matches
        matches = FuzzyMatcher().find_matches(tokens, "100", "Amount")
        assert len(matches) == 0

    def test_fuzzy_match_exception_in_context_lookup(self):
        """Should catch exceptions during fuzzy match processing."""
        # After refactoring, context lookup is in separate module
        # This test is no longer applicable as we use find_context_keywords function
        # Instead, we test that fuzzy matcher handles unparseable amounts gracefully
        fuzzy_matcher = FuzzyMatcher()
        tokens = [MockToken("not-a-number", (0, 0, 50, 20))]

        # Should not crash on unparseable amount
        matches = fuzzy_matcher.find_matches(tokens, "100", "Amount")
        assert len(matches) == 0


class TestFlexibleDateInvalidDateParsing:
    """Tests for invalid date parsing in flexible date matching."""

    def test_invalid_date_in_normalized_values(self):
        """Should handle invalid dates in normalized values gracefully."""
        # Line 520-521: ValueError continue in target date parsing
        date_matcher = FlexibleDateMatcher()
        tokens = [MockToken("2025-01-15", (0, 0, 80, 20))]

        # Pass an invalid date that matches the pattern but is not a valid date
        # e.g., 2025-13-45 matches pattern but month 13 is invalid
        matches = date_matcher.find_matches(
            tokens, "2025-13-45", "InvoiceDate"
        )
        # Should return empty as no valid target date could be parsed
        assert len(matches) == 0

    def test_invalid_date_token_in_document(self):
        """Should skip invalid date-like tokens in document."""
        # Line 568-569: ValueError continue in date token parsing
        date_matcher = FlexibleDateMatcher()
        tokens = [
            MockToken("2025-99-99", (0, 0, 80, 20)),  # Invalid date in doc
            MockToken("2025-01-18", (0, 50, 80, 70)), # Valid date
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-15", "InvoiceDate"
        )

        # Should only match the valid date
        assert len(matches) >= 1
        assert matches[0].value == "2025-01-18"

    def test_flexible_date_with_inline_keyword(self):
        """Should detect inline keywords in date tokens."""
        # Line 555: inline_keywords append
        date_matcher = FlexibleDateMatcher()
        tokens = [
            MockToken("Fakturadatum: 2025-01-18", (0, 0, 150, 20)),
        ]

        matches = date_matcher.find_matches(
            tokens, "2025-01-15", "InvoiceDate"
        )

        # Should find match with inline keyword
        assert len(matches) >= 1
        assert "fakturadatum" in matches[0].context_keywords


if __name__ == "__main__":
    pytest.main([__file__, "-v"])