""" Tests for the Field Matching Module. Tests cover all matcher functions in src/matcher/field_matcher.py Usage: pytest src/matcher/test_field_matcher.py -v -o 'addopts=' """ import pytest from dataclasses import dataclass from shared.matcher.field_matcher import FieldMatcher, find_field_matches from shared.matcher.models import Match from shared.matcher.token_index import TokenIndex from shared.matcher.context import CONTEXT_KEYWORDS, find_context_keywords from shared.matcher import utils as matcher_utils from shared.matcher.utils import normalize_dashes as _normalize_dashes from shared.matcher.strategies import ( SubstringMatcher, FlexibleDateMatcher, FuzzyMatcher, ) @dataclass class MockToken: """Mock token for testing.""" text: str bbox: tuple[float, float, float, float] page_no: int = 0 class TestNormalizeDashes: """Tests for _normalize_dashes function.""" def test_normalize_en_dash(self): """Should normalize en-dash to hyphen.""" assert _normalize_dashes("123\u2013456") == "123-456" def test_normalize_em_dash(self): """Should normalize em-dash to hyphen.""" assert _normalize_dashes("123\u2014456") == "123-456" def test_normalize_minus_sign(self): """Should normalize minus sign to hyphen.""" assert _normalize_dashes("123\u2212456") == "123-456" def test_normalize_middle_dot(self): """Should normalize middle dot to hyphen.""" assert _normalize_dashes("123\u00b7456") == "123-456" def test_normal_hyphen_unchanged(self): """Should keep normal hyphen unchanged.""" assert _normalize_dashes("123-456") == "123-456" class TestTokenIndex: """Tests for TokenIndex class.""" def test_build_index(self): """Should build spatial index from tokens.""" tokens = [ MockToken("hello", (0, 0, 50, 20)), MockToken("world", (60, 0, 110, 20)), ] index = TokenIndex(tokens) assert len(index.tokens) == 2 def test_get_center(self): """Should return correct center coordinates.""" token = MockToken("test", (0, 0, 100, 50)) tokens = [token] index = TokenIndex(tokens) center = index.get_center(token) assert center == (50.0, 25.0) def test_get_text_lower(self): """Should return lowercase text.""" token = MockToken("HELLO World", (0, 0, 100, 20)) tokens = [token] index = TokenIndex(tokens) assert index.get_text_lower(token) == "hello world" def test_find_nearby_within_radius(self): """Should find tokens within radius.""" token1 = MockToken("hello", (0, 0, 50, 20)) token2 = MockToken("world", (60, 0, 110, 20)) # 60px away token3 = MockToken("far", (500, 0, 550, 20)) # 500px away tokens = [token1, token2, token3] index = TokenIndex(tokens) nearby = index.find_nearby(token1, radius=100) assert len(nearby) == 1 assert nearby[0].text == "world" def test_find_nearby_excludes_self(self): """Should not include the target token itself.""" token1 = MockToken("hello", (0, 0, 50, 20)) token2 = MockToken("world", (60, 0, 110, 20)) tokens = [token1, token2] index = TokenIndex(tokens) nearby = index.find_nearby(token1, radius=100) assert token1 not in nearby def test_find_nearby_empty_when_none_in_range(self): """Should return empty list when no tokens in range.""" token1 = MockToken("hello", (0, 0, 50, 20)) token2 = MockToken("far", (500, 0, 550, 20)) tokens = [token1, token2] index = TokenIndex(tokens) nearby = index.find_nearby(token1, radius=50) assert len(nearby) == 0 class TestMatch: """Tests for Match dataclass.""" def test_match_creation(self): """Should create Match with all fields.""" match = Match( field="InvoiceNumber", value="12345", bbox=(0, 0, 100, 20), page_no=0, score=0.95, matched_text="12345", context_keywords=["fakturanr"] ) assert match.field == "InvoiceNumber" assert match.value == "12345" assert match.score == 0.95 def test_to_yolo_format(self): """Should convert to YOLO annotation format.""" match = Match( field="Amount", value="100", bbox=(100, 200, 200, 250), # x0, y0, x1, y1 page_no=0, score=1.0, matched_text="100", context_keywords=[] ) # Image: 1000x1000 yolo = match.to_yolo_format(1000, 1000, class_id=5) # Expected: center_x=150, center_y=225, width=100, height=50 # Normalized: x_center=0.15, y_center=0.225, w=0.1, h=0.05 assert yolo.startswith("5 ") parts = yolo.split() assert len(parts) == 5 assert float(parts[1]) == pytest.approx(0.15, rel=1e-4) assert float(parts[2]) == pytest.approx(0.225, rel=1e-4) assert float(parts[3]) == pytest.approx(0.1, rel=1e-4) assert float(parts[4]) == pytest.approx(0.05, rel=1e-4) class TestFieldMatcher: """Tests for FieldMatcher class.""" def test_init_defaults(self): """Should initialize with default values.""" matcher = FieldMatcher() assert matcher.context_radius == 200.0 assert matcher.min_score_threshold == 0.5 def test_init_custom_params(self): """Should initialize with custom parameters.""" matcher = FieldMatcher(context_radius=300.0, min_score_threshold=0.7) assert matcher.context_radius == 300.0 assert matcher.min_score_threshold == 0.7 class TestFieldMatcherExactMatch: """Tests for exact matching.""" def test_exact_match_full_score(self): """Should find exact match with full score.""" matcher = FieldMatcher() tokens = [MockToken("12345", (0, 0, 50, 20))] matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"]) assert len(matches) >= 1 assert matches[0].score == 1.0 assert matches[0].matched_text == "12345" def test_case_insensitive_match(self): """Should find case-insensitive match with lower score.""" matcher = FieldMatcher() tokens = [MockToken("HELLO", (0, 0, 50, 20))] matches = matcher.find_matches(tokens, "InvoiceNumber", ["hello"]) assert len(matches) >= 1 assert matches[0].score == 0.95 def test_digits_only_match(self): """Should match by digits only for numeric fields.""" matcher = FieldMatcher() tokens = [MockToken("INV-12345", (0, 0, 80, 20))] matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"]) assert len(matches) >= 1 assert matches[0].score == 0.9 def test_no_match_when_different(self): """Should return empty when no match found.""" matcher = FieldMatcher(min_score_threshold=0.8) tokens = [MockToken("99999", (0, 0, 50, 20))] matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"]) assert len(matches) == 0 class TestFieldMatcherContextKeywords: """Tests for context keyword boosting.""" def test_context_boost_with_nearby_keyword(self): """Should boost score when context keyword is nearby.""" matcher = FieldMatcher(context_radius=200) tokens = [ MockToken("fakturanr", (0, 0, 80, 20)), # Context keyword MockToken("12345", (100, 0, 150, 20)), # Value ] matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"]) assert len(matches) >= 1 # Score should be boosted above 1.0 (capped at 1.0) assert matches[0].score == 1.0 assert "fakturanr" in matches[0].context_keywords def test_no_boost_when_keyword_far_away(self): """Should not boost when keyword is too far.""" matcher = FieldMatcher(context_radius=50) tokens = [ MockToken("fakturanr", (0, 0, 80, 20)), # Context keyword MockToken("12345", (500, 0, 550, 20)), # Value - far away ] matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"]) assert len(matches) >= 1 assert "fakturanr" not in matches[0].context_keywords class TestFieldMatcherConcatenatedMatch: """Tests for concatenated token matching.""" def test_concatenate_adjacent_tokens(self): """Should match value split across adjacent tokens.""" matcher = FieldMatcher() tokens = [ MockToken("123", (0, 0, 30, 20)), MockToken("456", (35, 0, 65, 20)), # Adjacent, same line ] matches = matcher.find_matches(tokens, "InvoiceNumber", ["123456"]) assert len(matches) >= 1 assert "123456" in matches[0].matched_text or matches[0].value == "123456" def test_no_concatenate_when_gap_too_large(self): """Should not concatenate when gap is too large.""" matcher = FieldMatcher() tokens = [ MockToken("123", (0, 0, 30, 20)), MockToken("456", (100, 0, 130, 20)), # Gap > 50px ] # This might still match if exact matches work differently matches = matcher.find_matches(tokens, "InvoiceNumber", ["123456"]) # No concatenated match expected (only from exact/substring) concat_matches = [m for m in matches if "123456" in m.matched_text] # May or may not find depending on strategy class TestFieldMatcherSubstringMatch: """Tests for substring matching.""" def test_substring_match_in_longer_text(self): """Should find value as substring in longer token.""" matcher = FieldMatcher() tokens = [MockToken("Fakturanummer: 12345", (0, 0, 150, 20))] matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"]) assert len(matches) >= 1 # Substring match should have lower score substring_match = [m for m in matches if "12345" in m.matched_text] assert len(substring_match) >= 1 def test_no_substring_match_when_part_of_larger_number(self): """Should not match when value is part of a larger number.""" matcher = FieldMatcher(min_score_threshold=0.6) tokens = [MockToken("123456789", (0, 0, 100, 20))] matches = matcher.find_matches(tokens, "InvoiceNumber", ["456"]) # Should not match because 456 is embedded in larger number assert len(matches) == 0 class TestFieldMatcherFuzzyMatch: """Tests for fuzzy amount matching.""" def test_fuzzy_amount_match(self): """Should match amounts that are numerically equal.""" matcher = FieldMatcher() tokens = [MockToken("1234,56", (0, 0, 70, 20))] matches = matcher.find_matches(tokens, "Amount", ["1234.56"]) assert len(matches) >= 1 def test_fuzzy_amount_with_different_formats(self): """Should match amounts in different formats.""" matcher = FieldMatcher() tokens = [MockToken("1 234,56", (0, 0, 80, 20))] matches = matcher.find_matches(tokens, "Amount", ["1234,56"]) assert len(matches) >= 1 class TestFieldMatcherParseAmount: """Tests for parse_amount function.""" def test_parse_simple_integer(self): """Should parse simple integer.""" assert matcher_utils.parse_amount("100") == 100.0 def test_parse_decimal_with_dot(self): """Should parse decimal with dot.""" assert matcher_utils.parse_amount("100.50") == 100.50 def test_parse_decimal_with_comma(self): """Should parse decimal with comma (European format).""" assert matcher_utils.parse_amount("100,50") == 100.50 def test_parse_with_thousand_separator(self): """Should parse with thousand separator.""" assert matcher_utils.parse_amount("1 234,56") == 1234.56 def test_parse_with_currency_suffix(self): """Should parse and remove currency suffix.""" assert matcher_utils.parse_amount("100 SEK") == 100.0 assert matcher_utils.parse_amount("100 kr") == 100.0 def test_parse_swedish_ore_format(self): """Should parse Swedish öre format (kronor space öre).""" assert matcher_utils.parse_amount("239 00") == 239.00 assert matcher_utils.parse_amount("1234 50") == 1234.50 def test_parse_invalid_returns_none(self): """Should return None for invalid input.""" assert matcher_utils.parse_amount("abc") is None assert matcher_utils.parse_amount("") is None class TestFieldMatcherTokensOnSameLine: """Tests for tokens_on_same_line function.""" def test_same_line_tokens(self): """Should detect tokens on same line.""" token1 = MockToken("hello", (0, 10, 50, 30)) token2 = MockToken("world", (60, 12, 110, 28)) # Slight y variation assert matcher_utils.tokens_on_same_line(token1, token2) is True def test_different_line_tokens(self): """Should detect tokens on different lines.""" token1 = MockToken("hello", (0, 10, 50, 30)) token2 = MockToken("world", (0, 50, 50, 70)) # Different y assert matcher_utils.tokens_on_same_line(token1, token2) is False class TestFieldMatcherBboxOverlap: """Tests for bbox_overlap function.""" def test_full_overlap(self): """Should return 1.0 for identical bboxes.""" bbox = (0, 0, 100, 50) assert matcher_utils.bbox_overlap(bbox, bbox) == 1.0 def test_partial_overlap(self): """Should calculate partial overlap correctly.""" bbox1 = (0, 0, 100, 100) bbox2 = (50, 50, 150, 150) # 50% overlap on each axis overlap = matcher_utils.bbox_overlap(bbox1, bbox2) # Intersection: 50x50=2500, Union: 10000+10000-2500=17500 # IoU = 2500/17500 ≈ 0.143 assert 0.1 < overlap < 0.2 def test_no_overlap(self): """Should return 0.0 for non-overlapping bboxes.""" bbox1 = (0, 0, 50, 50) bbox2 = (100, 100, 150, 150) assert matcher_utils.bbox_overlap(bbox1, bbox2) == 0.0 class TestFieldMatcherDeduplication: """Tests for match deduplication.""" def test_deduplicate_overlapping_matches(self): """Should keep only highest scoring match for overlapping bboxes.""" matcher = FieldMatcher() tokens = [ MockToken("12345", (0, 0, 50, 20)), ] # Find matches with multiple values that could match same token matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345", "12345"]) # Should deduplicate to single match assert len(matches) == 1 class TestFieldMatcherFlexibleDateMatch: """Tests for flexible date matching.""" def test_flexible_date_same_month(self): """Should match dates in same year-month when exact match fails.""" matcher = FieldMatcher() tokens = [ MockToken("2025-01-15", (0, 0, 80, 20)), # Slightly different day ] # Search for different day in same month matches = matcher.find_matches( tokens, "InvoiceDate", ["2025-01-10"] ) # Should find flexible match (lower score) # Note: This depends on exact match failing first # If exact match works, flexible won't be tried class TestFieldMatcherPageFiltering: """Tests for page number filtering.""" def test_filters_by_page_number(self): """Should only match tokens on specified page.""" matcher = FieldMatcher() tokens = [ MockToken("12345", (0, 0, 50, 20), page_no=0), MockToken("12345", (0, 0, 50, 20), page_no=1), ] matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"], page_no=0) assert all(m.page_no == 0 for m in matches) def test_excludes_hidden_tokens(self): """Should exclude tokens with negative y coordinates (metadata).""" matcher = FieldMatcher() tokens = [ MockToken("12345", (0, -100, 50, -80), page_no=0), # Hidden metadata MockToken("67890", (0, 0, 50, 20), page_no=0), # Visible ] matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"], page_no=0) # Should not match the hidden token assert len(matches) == 0 class TestContextKeywordsMapping: """Tests for CONTEXT_KEYWORDS constant.""" def test_all_fields_have_keywords(self): """Should have keywords for all expected fields.""" expected_fields = [ "InvoiceNumber", "InvoiceDate", "InvoiceDueDate", "OCR", "Bankgiro", "Plusgiro", "Amount", "supplier_organisation_number", "supplier_accounts", ] for field in expected_fields: assert field in CONTEXT_KEYWORDS assert len(CONTEXT_KEYWORDS[field]) > 0 def test_keywords_are_lowercase(self): """All keywords should be lowercase.""" for field, keywords in CONTEXT_KEYWORDS.items(): for kw in keywords: assert kw == kw.lower(), f"Keyword '{kw}' in {field} should be lowercase" class TestFindFieldMatches: """Tests for find_field_matches convenience function.""" def test_finds_multiple_fields(self): """Should find matches for multiple fields.""" tokens = [ MockToken("12345", (0, 0, 50, 20)), MockToken("100,00", (0, 30, 60, 50)), ] field_values = { "InvoiceNumber": "12345", "Amount": "100", } results = find_field_matches(tokens, field_values) assert "InvoiceNumber" in results assert "Amount" in results assert len(results["InvoiceNumber"]) >= 1 assert len(results["Amount"]) >= 1 def test_skips_empty_values(self): """Should skip fields with None or empty values.""" tokens = [MockToken("12345", (0, 0, 50, 20))] field_values = { "InvoiceNumber": "12345", "Amount": None, "OCR": "", } results = find_field_matches(tokens, field_values) assert "InvoiceNumber" in results assert "Amount" not in results assert "OCR" not in results class TestSubstringMatchEdgeCases: """Additional edge case tests for substring matching.""" def test_unsupported_field_returns_empty(self): """Should return empty for unsupported field types.""" # Line 380: field_name not in supported_fields substring_matcher = SubstringMatcher() tokens = [MockToken("Faktura: 12345", (0, 0, 100, 20))] # Message is not a supported field for substring matching matches = substring_matcher.find_matches(tokens, "12345", "Message") assert len(matches) == 0 def test_case_insensitive_substring_match(self): """Should find case-insensitive substring match.""" # Line 397-398: case-insensitive substring matching substring_matcher = SubstringMatcher() # Use token without inline keyword to isolate case-insensitive behavior tokens = [MockToken("REF: ABC123", (0, 0, 100, 20))] matches = substring_matcher.find_matches(tokens, "abc123", "InvoiceNumber") assert len(matches) >= 1 # Case-insensitive base score is 0.70 (vs 0.75 for case-sensitive) # Score may have context boost but base should be lower assert matches[0].score <= 0.80 # 0.70 base + possible small boost def test_substring_with_digit_before(self): """Should not match when digit appears before value.""" # Line 407-408: char_before.isdigit() continue substring_matcher = SubstringMatcher() tokens = [MockToken("9912345", (0, 0, 60, 20))] matches = substring_matcher.find_matches(tokens, "12345", "InvoiceNumber") assert len(matches) == 0 def test_substring_with_digit_after(self): """Should not match when digit appears after value.""" # Line 413-416: char_after.isdigit() continue substring_matcher = SubstringMatcher() tokens = [MockToken("12345678", (0, 0, 70, 20))] matches = substring_matcher.find_matches(tokens, "12345", "InvoiceNumber") assert len(matches) == 0 def test_substring_with_inline_keyword(self): """Should boost score when keyword is in same token.""" substring_matcher = SubstringMatcher() tokens = [MockToken("Fakturanr: 12345", (0, 0, 100, 20))] matches = substring_matcher.find_matches(tokens, "12345", "InvoiceNumber") assert len(matches) >= 1 # Should have inline keyword boost assert "fakturanr" in matches[0].context_keywords class TestFlexibleDateMatchEdgeCases: """Additional edge case tests for flexible date matching.""" def test_no_valid_date_in_normalized_values(self): """Should return empty when no valid date in normalized values.""" # Line 520-521, 524: target_date parsing failures date_matcher = FlexibleDateMatcher() tokens = [MockToken("2025-01-15", (0, 0, 80, 20))] # Pass non-date value matches = date_matcher.find_matches( tokens, "not-a-date", "InvoiceDate" ) assert len(matches) == 0 def test_no_date_tokens_found(self): """Should return empty when no date tokens in document.""" # Line 571-572: no date_candidates date_matcher = FlexibleDateMatcher() tokens = [MockToken("Hello World", (0, 0, 80, 20))] matches = date_matcher.find_matches( tokens, "2025-01-15", "InvoiceDate" ) assert len(matches) == 0 def test_flexible_date_within_7_days(self): """Should score higher for dates within 7 days.""" # Line 582-583: days_diff <= 7 date_matcher = FlexibleDateMatcher() tokens = [ MockToken("2025-01-18", (0, 0, 80, 20)), # 3 days from target ] matches = date_matcher.find_matches( tokens, "2025-01-15", "InvoiceDate" ) assert len(matches) >= 1 assert matches[0].score >= 0.75 def test_flexible_date_within_3_days(self): """Should score highest for dates within 3 days.""" # Line 584-585: days_diff <= 3 date_matcher = FlexibleDateMatcher() tokens = [ MockToken("2025-01-17", (0, 0, 80, 20)), # 2 days from target ] matches = date_matcher.find_matches( tokens, "2025-01-15", "InvoiceDate" ) assert len(matches) >= 1 assert matches[0].score >= 0.8 def test_flexible_date_within_14_days_different_month(self): """Should match dates within 14 days even in different month.""" # Line 587-588: days_diff <= 14, different year-month date_matcher = FlexibleDateMatcher() tokens = [ MockToken("2025-02-05", (0, 0, 80, 20)), # 10 days from Jan 26 ] matches = date_matcher.find_matches( tokens, "2025-01-26", "InvoiceDate" ) assert len(matches) >= 1 def test_flexible_date_within_30_days(self): """Should match dates within 30 days with lower score.""" # Line 589-590: days_diff <= 30 date_matcher = FlexibleDateMatcher() tokens = [ MockToken("2025-02-10", (0, 0, 80, 20)), # 25 days from target ] matches = date_matcher.find_matches( tokens, "2025-01-16", "InvoiceDate" ) assert len(matches) >= 1 assert matches[0].score >= 0.55 def test_flexible_date_far_apart_without_context(self): """Should skip dates too far apart without context keywords.""" # Line 591-595: > 30 days, no context date_matcher = FlexibleDateMatcher() tokens = [ MockToken("2025-06-15", (0, 0, 80, 20)), # Many months from target ] matches = date_matcher.find_matches( tokens, "2025-01-15", "InvoiceDate" ) # Should be empty - too far apart and no context assert len(matches) == 0 def test_flexible_date_far_with_context(self): """Should match distant dates if context keywords present.""" # Line 592-595: > 30 days but has context date_matcher = FlexibleDateMatcher(context_radius=200) tokens = [ MockToken("fakturadatum", (0, 0, 80, 20)), # Context keyword MockToken("2025-06-15", (90, 0, 170, 20)), # Distant date ] matches = date_matcher.find_matches( tokens, "2025-01-15", "InvoiceDate" ) # May match due to context keyword # (depends on how context is detected in flexible match) def test_flexible_date_boost_with_context(self): """Should boost flexible date score with context keywords.""" # Line 598, 602-603: context_boost applied date_matcher = FlexibleDateMatcher(context_radius=200) tokens = [ MockToken("fakturadatum", (0, 0, 80, 20)), MockToken("2025-01-18", (90, 0, 170, 20)), # 3 days from target ] matches = date_matcher.find_matches( tokens, "2025-01-15", "InvoiceDate" ) if len(matches) > 0: assert len(matches[0].context_keywords) >= 0 class TestContextKeywordFallback: """Tests for context keyword lookup fallback (no spatial index).""" def test_fallback_context_lookup_without_index(self): """Should find context using O(n) scan when no index available.""" # Line 650-673: fallback context lookup matcher = FieldMatcher(context_radius=200) # Don't use find_matches which builds index, call internal method directly tokens = [ MockToken("fakturanr", (0, 0, 80, 20)), MockToken("12345", (100, 0, 150, 20)), ] # _token_index is None, so fallback is used keywords, boost = find_context_keywords(tokens, tokens[1], "InvoiceNumber", 200.0) assert "fakturanr" in keywords assert boost > 0 def test_context_lookup_skips_self(self): """Should skip the target token itself in fallback search.""" # Line 656-657: token is target_token continue matcher = FieldMatcher(context_radius=200) matcher._token_index = None # Force fallback token = MockToken("fakturanr 12345", (0, 0, 150, 20)) tokens = [token] keywords, boost = find_context_keywords(tokens, token, "InvoiceNumber", 200.0) # Token contains keyword but is the target - should still find if keyword in token # Actually this tests that it doesn't error when target is in list class TestFieldWithoutContextKeywords: """Tests for fields without defined context keywords.""" def test_field_without_keywords_returns_empty(self): """Should return empty keywords for fields not in CONTEXT_KEYWORDS.""" # Line 633-635: keywords empty, return early matcher = FieldMatcher() matcher._token_index = None tokens = [MockToken("hello", (0, 0, 50, 20))] # customer_number is not in CONTEXT_KEYWORDS keywords, boost = find_context_keywords(tokens, tokens[0], "UnknownField", 200.0) assert keywords == [] assert boost == 0.0 class TestParseAmountEdgeCases: """Additional edge case tests for _parse_amount.""" def test_parse_amount_with_parentheses(self): """Should remove parenthesized text like (inkl. moms).""" matcher = FieldMatcher() result = matcher_utils.parse_amount("100 (inkl. moms)") assert result == 100.0 def test_parse_amount_with_kronor_suffix(self): """Should handle 'kronor' suffix.""" matcher = FieldMatcher() result = matcher_utils.parse_amount("100 kronor") assert result == 100.0 def test_parse_amount_numeric_input(self): """Should handle numeric input (int/float).""" matcher = FieldMatcher() assert matcher_utils.parse_amount(100) == 100.0 assert matcher_utils.parse_amount(100.5) == 100.5 class TestFuzzyMatchExceptionHandling: """Tests for exception handling in fuzzy matching.""" def test_fuzzy_match_with_unparseable_token(self): """Should handle tokens that can't be parsed as amounts.""" # Line 481-482: except clause in fuzzy matching matcher = FieldMatcher() # Create a token that will cause parse issues tokens = [MockToken("abc xyz", (0, 0, 50, 20))] # This should not raise, just return empty matches matches = FuzzyMatcher().find_matches(tokens, "100", "Amount") assert len(matches) == 0 def test_fuzzy_match_exception_in_context_lookup(self): """Should catch exceptions during fuzzy match processing.""" # After refactoring, context lookup is in separate module # This test is no longer applicable as we use find_context_keywords function # Instead, we test that fuzzy matcher handles unparseable amounts gracefully fuzzy_matcher = FuzzyMatcher() tokens = [MockToken("not-a-number", (0, 0, 50, 20))] # Should not crash on unparseable amount matches = fuzzy_matcher.find_matches(tokens, "100", "Amount") assert len(matches) == 0 class TestFlexibleDateInvalidDateParsing: """Tests for invalid date parsing in flexible date matching.""" def test_invalid_date_in_normalized_values(self): """Should handle invalid dates in normalized values gracefully.""" # Line 520-521: ValueError continue in target date parsing date_matcher = FlexibleDateMatcher() tokens = [MockToken("2025-01-15", (0, 0, 80, 20))] # Pass an invalid date that matches the pattern but is not a valid date # e.g., 2025-13-45 matches pattern but month 13 is invalid matches = date_matcher.find_matches( tokens, "2025-13-45", "InvoiceDate" ) # Should return empty as no valid target date could be parsed assert len(matches) == 0 def test_invalid_date_token_in_document(self): """Should skip invalid date-like tokens in document.""" # Line 568-569: ValueError continue in date token parsing date_matcher = FlexibleDateMatcher() tokens = [ MockToken("2025-99-99", (0, 0, 80, 20)), # Invalid date in doc MockToken("2025-01-18", (0, 50, 80, 70)), # Valid date ] matches = date_matcher.find_matches( tokens, "2025-01-15", "InvoiceDate" ) # Should only match the valid date assert len(matches) >= 1 assert matches[0].value == "2025-01-18" def test_flexible_date_with_inline_keyword(self): """Should detect inline keywords in date tokens.""" # Line 555: inline_keywords append date_matcher = FlexibleDateMatcher() tokens = [ MockToken("Fakturadatum: 2025-01-18", (0, 0, 150, 20)), ] matches = date_matcher.find_matches( tokens, "2025-01-15", "InvoiceDate" ) # Should find match with inline keyword assert len(matches) >= 1 assert "fakturadatum" in matches[0].context_keywords if __name__ == "__main__": pytest.main([__file__, "-v"])